# Analysing



In [None]:
import pandas as pd
import scipy.stats as scs
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn import svm

## Stats

In [None]:
# Load the cleaned DC wikia CSV used by the statistics examples that follow.
df = pd.read_csv('data/dc-wikia-data-clean.csv')

### t-test example: do male characters have more appearances than female characters?

In [None]:
# Split the appearance counts into one sample per sex, discarding rows whose
# appearance count is missing.
appearances = df['appearances']
is_female = df['sex'] == 'Female'
is_male = df['sex'] == 'Male'

female_appearances = appearances[is_female].dropna()
male_appearances = appearances[is_male].dropna()

In [None]:
# Summary statistics for the female sample, to eyeball its distribution before testing.
female_appearances.describe()

In [None]:
# Summary statistics for the male sample, for comparison with the female one.
male_appearances.describe()

In [None]:
# Welch's t-test (equal_var=False): the two samples are not assumed to share a
# variance, which the default Student's t-test would require.
# The reported p-value is two-sided; check the sign of the statistic (negative
# means the female mean is lower) to answer the directional question.
scs.ttest_ind(female_appearances, male_appearances, equal_var=False)

### Chi-square example: are male and female characters distributed similarly across alignments (good / bad / neutral)?

In [None]:
# Contingency table of character counts: one row per sex, one column per alignment.
# fill_value=0 makes a sex/alignment pair with no characters count as 0 instead
# of NaN, which would otherwise turn the chi-square result into NaN.
frequencies = (
    df[df['sex'].isin(['Female', 'Male'])]
    .pivot_table(
        index='sex',
        columns='align',
        values='page_id',
        aggfunc='count',
        fill_value=0,
    )
)
frequencies

In [None]:
# chi2_contingency returns (statistic, p-value, degrees of freedom, expected
# frequencies). Name all four instead of unpacking into `_`, which IPython
# uses to hold the last cell output.
chi2, p, dof, expected = scs.chi2_contingency(frequencies)
chi2, p

*Note: take a look at `scipy.stats`; if you need more powerful tools, check out [StatsModels](http://www.statsmodels.org/stable/index.html).*

## Machine learning

A good place to start if you're not sure what you're doing: [choosing the right estimator](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html).

Can we predict a character's alignment from eye color, hair color, sex, and the `gsm` and `id` columns?

In [None]:
# Reload the data and give every categorical column an integer-coded companion
# named `<column>_code` (pandas assigns the code -1 to missing values).
categorical_columns = ['align', 'eye', 'hair', 'sex', 'gsm', 'id']

df = pd.read_csv('data/dc-wikia-data-clean.csv')
df = df.assign(**{
    f'{name}_code': df[name].astype('category').cat.codes
    for name in categorical_columns
})

In [None]:
# Keep only the characters labelled Good or Bad, then separate the coded
# feature columns from the coded target column.
is_good_or_bad = df['align'].isin(['Good', 'Bad'])
df = df[is_good_or_bad]

feature_columns = ['eye_code', 'hair_code', 'sex_code', 'gsm_code', 'id_code']
X = df[feature_columns]
y = df['align_code']

In [None]:
# Fix the shuffle seed so the train/test split, and every score computed from
# it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, random_state=42
)

In [None]:
# Pick exactly one estimator; the commented-out lines are alternatives to swap in.
# model = linear_model.SGDClassifier(max_iter=1000, n_jobs=-1)
# model = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=-1)
model = neighbors.KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
# model = svm.LinearSVC()

In [None]:
%%time

# Fit the selected estimator on the training split; %%time reports the cell's run time.
model.fit(X_train, y_train)

In [None]:
%%time

# Predict labels for the held-out test split; %%time reports the cell's run time.
predictions = model.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred), in that order.
metrics.accuracy_score(y_test, predictions)

In [None]:
# scikit-learn metrics take (y_true, y_pred); with the arguments the other way
# round this call would report precision rather than recall.
metrics.recall_score(y_test, predictions)

In [None]:
# scikit-learn metrics take (y_true, y_pred), in that order.
metrics.f1_score(y_test, predictions)