In [3]:
import os
import itertools
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)

import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
stop_words = set(stopwords.words('english')) 

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
# Data 

# Kagle notebook
# df = pd.read_csv('../input/gender-classifier-DFE-791531.csv', encoding='latin1')

# Jupyter notebook
df = pd.read_csv('twitter_gender_classifier.csv', encoding='latin1')

In [4]:
# Exploration

print(df.shape) 
display(df.head(5))

# [print(column) for column in df.columns]
for column in df.columns: # value counts, nans, type
    print('{:25} {:5} {:5} {}'.format(column, len(df[column].value_counts()), len(df[df[column].isna() == True]), df[column].dtype ))

display(df.info())
display(df.describe())

display(df.gender.value_counts()) # pick column

display(df.tweet_count.plot()) # pick numeric column
display(df.tweet_count.hist()) # pick numeric column

(20050, 26)


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,created,description,fav_number,gender_gold,link_color,name,profile_yn_gold,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,12/5/13 1:48,i sing my own rhythm.,0,,08C2C2,sheezy0,,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.5873e+17,main; @Kan1shk3,Chennai
1,815719227,False,finalized,3,10/26/15 23:30,male,1.0,yes,1.0,10/1/12 13:51,I'm the author of novels filled with family dr...,68,,0084B4,DavdBurnett,,https://pbs.twimg.com/profile_images/539604221...,0,C0DEED,ÛÏIt felt like they were my friends and I was...,,7471,10/26/15 12:40,6.5873e+17,,Eastern Time (US & Canada)
2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,11/28/14 11:30,louis whining and squealing and all,7696,,ABB8C2,lwtprettylaugh,,https://pbs.twimg.com/profile_images/657330418...,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,6.5873e+17,clcncl,Belgrade
3,815719229,False,finalized,3,10/26/15 23:10,male,1.0,yes,1.0,6/11/09 22:39,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",202,,0084B4,douggarland,,https://pbs.twimg.com/profile_images/259703936...,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,6.5873e+17,"Palo Alto, CA",Pacific Time (US & Canada)
4,815719230,False,finalized,3,10/27/15 1:15,female,1.0,yes,1.0,4/16/14 13:23,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,37318,,3B94D9,WilfordGemma,,https://pbs.twimg.com/profile_images/564094871...,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,6.5873e+17,,


_unit_id                  20050     0 int64
_golden                       2     0 bool
_unit_state                   2     0 object
_trusted_judgments           32     0 int64
_last_judgment_at           283    50 object
gender                        4    97 object
gender:confidence           923    26 float64
profile_yn                    2     0 object
profile_yn:confidence       267     0 float64
created                   18699     0 object
description               15140  3744 object
fav_number                 6784     0 int64
gender_gold                   6 20000 object
link_color                 3001     0 object
name                      18795     0 object
profile_yn_gold               1 20000 object
profileimage              17164     0 object
retweet_count                22     0 int64
sidebar_color               561     0 object
text                      18412     0 object
tweet_coord                 146 19891 object
tweet_count               14280     0 int64
tweet_created  

None

Unnamed: 0,_unit_id,_trusted_judgments,gender:confidence,profile_yn:confidence,fav_number,retweet_count,tweet_count,tweet_id
count,20050.0,20050.0,20024.0,20050.0,20050.0,20050.0,20050.0,20050.0
mean,815729400.0,3.615711,0.882756,0.993221,4382.201646,0.079401,38924.69,6.58735e+17
std,6000.801,12.33189,0.191403,0.047168,12518.575919,2.649751,116837.1,5000124000000.0
min,815719200.0,3.0,0.0,0.6272,0.0,0.0,1.0,6.5873e+17
25%,815724300.0,3.0,0.6778,1.0,11.0,0.0,2398.0,6.5873e+17
50%,815729400.0,3.0,1.0,1.0,456.0,0.0,11441.5,6.5873e+17
75%,815734500.0,3.0,1.0,1.0,3315.5,0.0,40027.5,6.5874e+17
max,815758000.0,274.0,1.0,1.0,341621.0,330.0,2680199.0,6.5874e+17


female     6700
male       6194
brand      5942
unknown    1117
Name: gender, dtype: int64

<matplotlib.axes._subplots.AxesSubplot at 0x7feb367ed9b0>

<matplotlib.axes._subplots.AxesSubplot at 0x7feb367ed9b0>

In [5]:
# Delete NaN/monolytic/unhelpful features
del df['gender_gold']
del df['profile_yn_gold']
del df['tweet_coord']

# Fill NaN with unknown (as it truly is.)
df.gender = df.gender.fillna('unknown')

# Recode gender to numeric
df['gender'] = df['gender'].map({"female":0, "male":1, "brand":2,"unknown":3}).apply(lambda x: int(x))

# Fill non values.
df['gender:confidence'] = df['gender:confidence'].fillna(0.0) # n=25. We got truly 0.0 confidence here. 
df['_last_judgment_at'] = df['_last_judgment_at'].fillna(df['_last_judgment_at'].mode().iloc[0]) # n=50. Fill with mode, common sence, low impact.

columns_to_recode = ['_golden', '_unit_state', 'profile_yn'] # Recode columns to numeric
le =  LabelEncoder()
for column in columns_to_recode:
    df[column] = le.fit_transform(df[column])

In [6]:
# Get part of speech in tweet as a features 

# tweet as a POS string fg. " NN VBP JJ "
def tweet_pos(tweet):
    words = nltk.word_tokenize(tweet) 
    legit_words = [w for w in words if not w in stop_words]
    pos = [w[1] for w in nltk.pos_tag(legit_words)]
    return ' ' + ' '.join(pos) + ' '

# Get all n>30 POS types
def get_valid_pos_names(pos_column):
    all_pos = list(np.concatenate([v[0].split() for v in pos_column.str.split(';\s')]))
    return np.unique([i for i in all_pos if all_pos.count(i)>30])

df['pos'] = df.text.apply(tweet_pos)
df['pos'] = df.pos.apply(lambda x: ''.join([w for w in x if w.isalpha() or w.isspace()]))

for pos in get_valid_pos_names(df.pos):
    df[pos] = df.pos.str.count(' {} '.format(pos))

del df['pos']

pd.read_csv('twitter_gender_classifier.csv', encoding='latin1')

In [7]:
# Support feature. Male, Female = 1, Brand, Unknown = 0.
df['is_human'] = df['gender'].map({0:1,1:1,2:0,3:0})

# Move y(gender) column to be the last one.
df.g = df['gender']
del df['gender']
df['gender'] = df.g

  """


In [8]:
# Problem simplification for baseline model purpose
print(df.shape)
df = df[(df.gender == 0) | (df.gender==1)]
print(df.shape)

(20050, 56)
(12894, 56)


In [9]:
# Features to use for baseline model
features = [ '_trusted_judgments', 'retweet_count', 'tweet_count']
features = pd.concat([df.loc[:, features],df.iloc[:,23:55]],axis=1) # features + POS features.

X = features
y = df['gender']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [10]:
LGR = LogisticRegression(penalty='l1', 
                         C=1.2, 
                         solver='liblinear').fit(X_train, y_train)

accuracy_train = round(LGR.score(X_train, y_train), 4)
accuracy_test  = round(LGR.score(X_test,  y_test), 4)
predictions = LGR.predict(X_test)
probabilities = LGR.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

Accuracy - train: 0.5702
Accuracy - test:  0.5624

Confusion matrix:
TP: 832   TN: 1344
FP: 716   FN: 977

Classification report: 

               precision    recall  f1-score   support

           0       0.58      0.65      0.61      2060
           1       0.54      0.46      0.50      1809

   micro avg       0.56      0.56      0.56      3869
   macro avg       0.56      0.56      0.55      3869
weighted avg       0.56      0.56      0.56      3869



In [11]:
def add_features(df, split):
    y = df.iloc[:,split:]
    X = df.iloc[:,:split]
    new_X = new_features(X)
    
    return pd.concat( [new_X,y],axis=1)

def new_features(X):
    
    # Text features
    X['tweet_length'] = X.text.apply(lambda x: len(x))
    X['words_number'] = X.text.apply(lambda x: len(x.split(' ')))
    
    # Description features
    X['desc_length'] = X.description.apply(lambda x: len(str(x)))
    X['desc_words_number'] = X.description.apply(lambda x: len(str(x).split(' ')))
    
    # Word features
    positive_words = ['good', 'discover', 'achieve', 'create', 'fresh', 'happy', 'love', 'will']
    negative_words = ['bad', 'evil', 'lame', 'wrong', 'blame', 'never', 'less']
    
    # Some words out of blue
    man_mby = ['sport','football','won','first']
    woman_mby = ['care', 'metoo', 'happy','scared','angry']
    human_mby = ['i', 'me','my','fuck', 'damn', 'shit', 'lol', 'rofl', 'wtf']
    org_mby = ['we', 'our']
    gender_related = ['she','her','his','him','he']
    
    # internet list of words recognised by genders:
    # https://www.businessinsider.com/gender-and-vocabulary-analysis-2014-6?IR=T
    reco_man = ['codec','solenoid','golem','mach','humvee','ymore','scimitar','kevlar','paladin','bolshevism','biped','dreadnought']
    reco_woman = ['taffeta','tresses','bottlebrush','flouncy','mascarpone','decoupage','progesterone','wisteria','taupe','flouncing','peony','bodice']
    
    # words used by gender 
    gender_used =  ['chief', 'pillows','rear','lace','civil','curls','bigger','dress','absolutely','china','enemy','skirt','fellows','curtains','king','cups','public','sheets','contact','shrugged']

    for word in itertools.chain(reco_man, gender_used, reco_woman, positive_words, negative_words, man_mby, woman_mby, gender_related, human_mby, org_mby):
        X[word] = X.text.apply(lambda x: 1 if word in str(x) else 0)
        X['d_' + word] = X.description.apply(lambda x: 1 if word in str(x) else 0)
    
    # Add
    
    
    
    return X

# Add bag of words features
df.text = df.text.str.lower()
df.description = df.description.str.lower()
df = add_features(df, 22)
df = add_features(df, 22)
df = df.T.drop_duplicates().T
print(df.shape)
df.head(1)

(12894, 175)


Unnamed: 0,_unit_id,_golden,_trusted_judgments,_last_judgment_at,gender:confidence,profile_yn,profile_yn:confidence,created,description,fav_number,link_color,name,profileimage,retweet_count,sidebar_color,text,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone,tweet_length,words_number,desc_length,desc_words_number,...,JJS,MD,NN,NNP,NNPS,NNS,POS,PRP,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB,gender
0,815719226,0,3,10/26/15 23:24,1,1,1,12/5/13 1:48,i sing my own rhythm.,0,08C2C2,sheezy0,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,robbie e responds to critics after win against...,110964,10/26/15 12:40,6.5873e+17,main; @Kan1shk3,Chennai,109,14,21,5,...,0,0,2,4,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [12]:
# Sanity save copy
df_copy_save = df.copy(deep=True)
df_copy_save.to_csv('POSwordskagglebaseline.csv', index=False)

In [5]:
# Sanity recovery
df = pd.read_csv('POSwordskagglebaseline.csv', encoding='latin1')

In [6]:
# Problem simplification for baseline model purpose
print(df.shape)
df = df[(df.gender == 0) | (df.gender==1)]
print(df.shape)

# Features to use for baseline model
features = [ '_trusted_judgments', 'retweet_count', 'tweet_count']
features = pd.concat([df.loc[:, features],df.iloc[:,21:174]],axis=1) #  + BOW, POS features.

X = features
y = df['gender'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

LGR = LogisticRegression(penalty='l1', 
                             C=0.15, 
                             solver='liblinear')

LGR.fit(X_train, y_train)
accuracy_train = round(LGR.score(X_train, y_train), 4)
accuracy_test  = round(LGR.score(X_test,  y_test), 4)
predictions = LGR.predict(X_test)
probabilities = LGR.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

(12894, 175)
(12894, 175)
Accuracy - train: 0.6096
Accuracy - test:  0.6004

Confusion matrix:
TP: 956   TN: 1367
FP: 623   FN: 923

Classification report: 

               precision    recall  f1-score   support

           0       0.60      0.69      0.64      1990
           1       0.61      0.51      0.55      1879

   micro avg       0.60      0.60      0.60      3869
   macro avg       0.60      0.60      0.60      3869
weighted avg       0.60      0.60      0.60      3869



In [8]:

from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(max_depth=32, max_features=40, min_samples_leaf=1200).fit(X_train, y_train)

accuracy_train = round(DTC.score(X_train, y_train), 4)
accuracy_test  = round(DTC.score(X_test,  y_test), 4)
predictions = DTC.predict(X_test)
probabilities = DTC.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

Accuracy - train: 0.5535
Accuracy - test:  0.5407

Confusion matrix:
TP: 1029  TN: 1063
FP: 927   FN: 850

Classification report: 

               precision    recall  f1-score   support

           0       0.56      0.53      0.54      1990
           1       0.53      0.55      0.54      1879

   micro avg       0.54      0.54      0.54      3869
   macro avg       0.54      0.54      0.54      3869
weighted avg       0.54      0.54      0.54      3869



In [22]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=3000, max_depth=400, max_features=100, random_state=0).fit(X_train, y_train)

accuracy_train = round(clf.score(X_train, y_train), 4)
accuracy_test  = round(clf.score(X_test,  y_test), 4)
predictions = clf.predict(X_test)
probabilities = clf.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

Accuracy - train: 0.9987
Accuracy - test:  0.6074

Confusion matrix:
TP: 1074  TN: 1276
FP: 714   FN: 805

Classification report: 

               precision    recall  f1-score   support

           0       0.61      0.64      0.63      1990
           1       0.60      0.57      0.59      1879

   micro avg       0.61      0.61      0.61      3869
   macro avg       0.61      0.61      0.61      3869
weighted avg       0.61      0.61      0.61      3869



In [68]:

from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(algorithm='SAMME.R', 
                         base_estimator=DecisionTreeClassifier(max_depth=3),
                         learning_rate=0.16, 
                         n_estimators=60, 
                         random_state=17).fit(X_train, y_train)

accuracy_train = round(clf.score(X_train, y_train), 4)
accuracy_test  = round(clf.score(X_test,  y_test), 4)
predictions = clf.predict(X_test)
probabilities = clf.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

results = []
for a in ['SAMME.R']: # , 'SAMME'
    for be in [DecisionTreeClassifier(max_depth=3)]: # , DecisionTreeClassifier(max_depth=2)
        for lr in [0.16]: # 0.155,0.156,0.157, 0.158, 0.159, 0,0.161,0.162,0.163,0.164,0.166
            for ne in [60]: # 24,32,36,42,58, 59,60,61, 62,63
                #print('\n---',a,be,lr,ne)
                
                clf = AdaBoostClassifier(algorithm=a, 
                                         base_estimator=be,
                                         learning_rate=lr, 
                                         n_estimators=ne, 
                                         random_state=17).fit(X_train, y_train)

                accuracy_train = round(clf.score(X_train, y_train), 4)
                accuracy_test  = round(clf.score(X_test,  y_test), 4)
                predictions = clf.predict(X_test)
                probabilities = clf.predict_proba(X_test)
                tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()
                
                if accuracy_test > 0: #.6125:
                    print()
                    print('Accuracy - test:  {}    '.format(accuracy_test), a,be,lr,ne)
                    results.append([accuracy_test, a,be,lr,ne])

Accuracy - train: 0.6566
Accuracy - test:  0.6149

Confusion matrix:
TP: 1056  TN: 1323
FP: 667   FN: 823

Classification report: 

               precision    recall  f1-score   support

           0       0.62      0.66      0.64      1990
           1       0.61      0.56      0.59      1879

   micro avg       0.61      0.61      0.61      3869
   macro avg       0.61      0.61      0.61      3869
weighted avg       0.61      0.61      0.61      3869


Accuracy - test:  0.6154     SAMME.R DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 0.16 60


In [69]:
for r in results:
    if r[0] > 0.61:
        print(r[0],r[3:])

0.6154 [0.16, 60]


In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import time

end = time.time()
classifiers = [
    #KNeighborsClassifier(3),
    ##SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for clf in classifiers:
    
    start = time.time()
    print()
    print(clf)
    clf.fit(X_train, y_train)
    accuracy_train = round(clf.score(X_train, y_train), 4)
    accuracy_test  = round(clf.score(X_test,  y_test), 4)
    predictions = clf.predict(X_test)
    tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

    print('Accuracy - train: {}'.format(accuracy_train))
    print('Accuracy - test:  {}'.format(accuracy_test))
    print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
    print('\nClassification report: \n\n', classification_report(y_test, predictions))
    
    print('Took:', time.time() - start)
    print('----\n')


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Accuracy - train: 0.5998
Accuracy - test:  0.5653

Confusion matrix:
TP: 1096  TN: 1091
FP: 899   FN: 783

Classification report: 

               precision    recall  f1-score   support

           0       0.58      0.55      0.56      1990
           1       0.55      0.58      0.57      1879

   micro avg       0.57      0.57      0.57      3869
   macro avg       0.57      0.57      0.57      3869
weighted avg       0.57      0.57      0.57      3869

Took: 0.06930279731750488
----


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=1, max_leaf_nodes=None,
            min_impur



In [None]:
98+0