# Analysing



In [1]:
import pandas as pd
import scipy.stats as scs
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn import svm

## Stats

In [2]:
df = pd.read_csv('data/dc-wikia-data-clean.csv')

### t-test example: do male characters have more appearances than female characters?

In [3]:
appearances = df['appearances']
female_appearances = appearances[df['sex'] == 'Female'].dropna()
male_appearances = appearances[df['sex'] == 'Male'].dropna()

In [4]:
female_appearances.describe()

count    1880.000000
mean       22.484574
std        68.717080
min         1.000000
25%         3.000000
50%         6.000000
75%        16.250000
max      1231.000000
Name: appearances, dtype: float64

In [5]:
male_appearances.describe()

count    4527.00000
mean       24.49989
std        95.16820
min         1.00000
25%         2.00000
50%         6.00000
75%        15.00000
max      3093.00000
Name: appearances, dtype: float64

In [6]:
scs.ttest_ind(female_appearances, male_appearances)

Ttest_indResult(statistic=-0.8324593653567558, pvalue=0.40518074246540314)

### Chi-square example: do male and female align similarly (good / bad / neutral)?

In [41]:
frequencies = (
    df[df['sex'].isin(['Female', 'Male'])]
    .pivot_table(index='sex', columns='align', values='page_id', aggfunc='count')
)
frequencies

align,Bad,Good,Neutral,Reformed
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,597,953,196,1
Male,2223,1843,359,2


In [8]:
chi2, p, _, _ = scs.chi2_contingency(frequencies)
chi2, p

(130.26206661662422, 4.748944716986016e-28)

*Note: take a look at `scipy.stats`, if you need more powerful tools check out [StatsModels](http://www.statsmodels.org/stable/index.html).*

## Machine learning

A good place to start if you're not sure what you're doing: [choosing the right estimator](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html).

Can we know the character alignment by eye color, hair color, ?

In [44]:
df = pd.read_csv('data/dc-wikia-data-clean.csv')

categorical_columns = ['align', 'eye', 'hair', 'sex', 'gsm', 'id']

for col in categorical_columns:
    df[f'{col}_code'] = df[col].astype('category').cat.codes

In [46]:
df = df[df['align'].isin(['Good', 'Bad'])]
X = df[['eye_code', 'hair_code', 'sex_code', 'gsm_code', 'id_code']]
y = df['align_code']

In [47]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

In [48]:
# model = linear_model.SGDClassifier(max_iter=1000, n_jobs=-1)
# model = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=-1)
model = neighbors.KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
# model = svm.LinearSVC()

In [49]:
%%time

model.fit(X_train, y_train)

CPU times: user 6.34 ms, sys: 2.62 ms, total: 8.96 ms
Wall time: 8.07 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=100, p=2,
                     weights='uniform')

In [50]:
%%time

predictions = model.predict(X_test)

CPU times: user 128 ms, sys: 6.65 ms, total: 135 ms
Wall time: 83.1 ms


In [51]:
metrics.accuracy_score(predictions, y_test)

0.5942737430167597

In [52]:
metrics.recall_score(predictions, y_test)

0.5788804071246819

In [53]:
metrics.f1_score(predictions, y_test)

0.6103286384976526