# Analysing



In [None]:
import pandas as pd
import scipy.stats as scs
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn import svm

## Stats

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/dc-wikia-data-clean.csv')

### t-test example: do male characters have more appearances than female characters?

In [None]:
# Appearance counts split by sex; entries with a missing count are dropped
# from each group before testing.
appearances = df['appearances']
female_appearances, male_appearances = (
    appearances[df['sex'] == sex_label].dropna()
    for sex_label in ('Female', 'Male')
)

In [None]:
# Summary statistics of the appearance counts for female characters.
female_appearances.describe()

In [None]:
# Summary statistics of the appearance counts for male characters.
male_appearances.describe()

In [None]:
# Welch's t-test: `equal_var=False` drops the default assumption that both
# groups share the same variance, which has not been checked for these samples.
# The reported p-value is two-sided.
scs.ttest_ind(female_appearances, male_appearances, equal_var=False)

### Chi-square example: do male and female characters have similar alignments (good / bad / neutral)?

In [None]:
# Contingency table of character counts: one row per sex, one column per alignment.
# `page_id` is selected before counting so only that column is aggregated, and
# `fill_value=0` turns a sex/alignment pair with no characters into an explicit
# zero instead of a NaN that would propagate into the chi-square computation.
frequencies = (
    df[df['sex'].isin(['Male', 'Female'])]
    .groupby(['sex', 'align'])
    ['page_id']
    .count()
    .unstack(level='align', fill_value=0)
)

In [None]:
# Chi-square test of independence on the sex x alignment table.
# Only the statistic and the p-value are kept; slicing (instead of unpacking the
# remaining values into `_`) leaves IPython's `_` last-output variable untouched.
chi2, p = scs.chi2_contingency(frequencies)[:2]
chi2, p

*Note: take a look at `scipy.stats` first; if you need more powerful tools, check out [StatsModels](http://www.statsmodels.org/stable/index.html).*

## Machine learning

Can we predict a character's alignment from their eye color, hair color, sex and the `id` column?

In [None]:
# Re-read the CSV so `df` is rebuilt from the file every time this cell runs
# (the statements below overwrite `align` and add new columns).
df = pd.read_csv('data/dc-wikia-data-clean.csv')


def simplify_align(align):
    """Collapse an alignment label into one of three classes.

    Returns ``align`` unchanged when it is ``'Good'`` or ``'Bad'``; every other
    value (including missing values such as NaN or None) becomes ``'Other'``.
    """
    return align if align in ('Good', 'Bad') else 'Other'
    

# Reduce the target to Good / Bad / Other. Missing alignments become 'Other'
# here, so the dropna below only removes rows lacking a feature value.
df['align'] = df['align'].apply(simplify_align)
# `.copy()` gives the loop below an independent frame to add columns to.
df = df.dropna(subset=['eye', 'hair', 'sex', 'id', 'align']).copy()

# Encode each categorical column as integer codes 0..k-1 (categories in sorted
# order). The built-in `hash()` is salted per interpreter process for strings,
# so hash-based codes change between kernel restarts, and reducing them modulo
# 10 can map distinct categories -- including target classes -- to one code.
for col in ['eye', 'hair', 'align', 'sex', 'id']:
    df[f'{col}_code'] = pd.factorize(df[col], sort=True)[0]

In [None]:
# Target: the encoded alignment. Features: the encoded eye, hair, sex and id columns.
y = df.loc[:, 'align_code']
X = df.loc[:, ['eye_code', 'hair_code', 'sex_code', 'id_code']]

In [None]:
# Hold out a test split (scikit-learn's default test size) for evaluation.
# A fixed `random_state` makes the shuffle, and therefore every score computed
# from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, random_state=42
)

In [None]:
# Classifier under evaluation; the commented-out lines are alternative models
# to swap in. `random_state` fixes the forest's internal randomness so repeated
# runs give the same fitted model.
# model = linear_model.SGDClassifier(max_iter=1000, n_jobs=-1)
model = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42)
# model = neighbors.KNeighborsClassifier(n_neighbors=20, n_jobs=-1)
# model = svm.LinearSVC()

In [None]:
%%time

# Fit the selected classifier on the training split; the %%time magic at the
# top of this cell reports how long the fit takes.
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and score it. scikit-learn metrics take
# (y_true, y_pred); accuracy is symmetric, so the value is unchanged, but the
# documented order keeps this line correct if the metric is swapped later.
predictions = model.predict(X_test)
metrics.accuracy_score(y_test, predictions)

In [None]:
# How many times each class code was predicted; useful for spotting a model
# that mostly predicts a single class.
pd.Series(predictions).value_counts()