# Analysing



In [1]:
import pandas as pd
import scipy.stats as scs
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn import svm

## Stats

In [2]:
df = pd.read_csv('data/dc-wikia-data-clean.csv')

### t-test example: do male characters have more appearances than female characters?

In [3]:
appearances = df['appearances']
female_appearances = appearances[df['sex'] == 'Female'].dropna()
male_appearances = appearances[df['sex'] == 'Male'].dropna()

In [4]:
female_appearances.describe()

count    1880.000000
mean       22.484574
std        68.717080
min         1.000000
25%         3.000000
50%         6.000000
75%        16.250000
max      1231.000000
Name: appearances, dtype: float64

In [5]:
male_appearances.describe()

count    4527.00000
mean       24.49989
std        95.16820
min         1.00000
25%         2.00000
50%         6.00000
75%        15.00000
max      3093.00000
Name: appearances, dtype: float64

In [6]:
scs.ttest_ind(female_appearances, male_appearances)

Ttest_indResult(statistic=-0.8324593653567558, pvalue=0.40518074246540314)

### Chi-square example: do male and female align similarly (good / bad / neutral)?

In [11]:
(
    df[df['sex'].isin(['Male', 'Female'])]
    .groupby(['sex', 'align'])
    .agg({'page_id': 'count'})
    .unstack(level='align')
)

Unnamed: 0_level_0,page_id,page_id,page_id,page_id
align,Bad,Good,Neutral,Reformed
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,597,953,196,1
Male,2223,1843,359,2


In [14]:
frequencies = (
    df[df['sex'].isin(['Male', 'Female'])]
    .groupby(['sex', 'align'])
    .count()
    ['page_id']
    .unstack(level='align')
)

In [15]:
chi2, p, _, _ = scs.chi2_contingency(frequencies)
chi2, p

(130.26206661662422, 4.7489447169860158e-28)

*Note: take a look at `scipy.stats`, if you need more powerful tools check out [StatsModels](http://www.statsmodels.org/stable/index.html).*

## Machine learning

Can we know the character alignment by eye color, hair color, sex and id?

In [17]:
df['align'].value_counts()

Bad         2895
Good        2832
Neutral      565
Reformed       3
Name: align, dtype: int64

In [25]:
df = pd.read_csv('data/dc-wikia-data-clean.csv')


def simplify_align(align):
    if align in ['Good', 'Bad']:
        return align
    return 'Other'
    

df['align'] = df['align'].apply(simplify_align)
df = df.dropna(subset=['eye', 'hair', 'sex', 'id', 'align'])

for col in ['eye', 'hair', 'align', 'sex', 'id']:
    df[f'{col}_code'] = df[col].apply(hash)

In [28]:
X = df[['eye_code', 'hair_code', 'sex_code', 'id_code']]
y = df['align_code']

In [29]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

In [34]:
# model = linear_model.SGDClassifier(max_iter=1000, n_jobs=-1)
# model = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=-1)
# model = neighbors.KNeighborsClassifier(n_neighbors=20, n_jobs=-1)
model = svm.LinearSVC()

In [35]:
%%time

model.fit(X_train, y_train)

CPU times: user 198 ms, sys: 2.67 ms, total: 200 ms
Wall time: 200 ms


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [36]:
predictions = model.predict(X_test)
metrics.accuracy_score(predictions, y_test)

0.28694158075601373

In [37]:
pd.Series(predictions).value_counts()

 812200802180742890     314
-2237280582048473783    159
 8563152435839049822    109
dtype: int64