In [26]:
import pandas as pd
import numpy as np

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

from IPython.display import display
show = display

from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from sklearn.model_selection import KFold
import sklearn.metrics as skm

# Gender prediction

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
import joblib


In [29]:
show("# Twitter")

lda, liwc, mtf = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
train = pd.DataFrame()

# TODO add city as feature
for city, city_train in [("NewYork", "NY"), ("London", "London"), ("Singapore", "Singapore")]:
    lda = pd.concat([lda, pd.read_csv(f"data/Features/features{city}/Twitter/LDA50Features.csv", encoding="ISO-8859-15")], axis=0, ignore_index=True)
    liwc = pd.concat([liwc, pd.read_csv(f"data/Features/features{city}/Twitter/LIWCFeatures.csv", encoding="ISO-8859-15")], axis=0, ignore_index=True)
    mtf = pd.concat([mtf, pd.read_csv(f"data/Features/features{city}/Twitter/manuallyDefinedTextFeatures.csv", encoding="ISO-8859-15")], axis=0, ignore_index=True)

    train = pd.concat([train, pd.read_csv(f"data/{city_train}Train.csv")], axis=0, ignore_index=True)

lda = lda.drop_duplicates(subset=["_id"], ignore_index=True)
liwc = liwc.drop_duplicates(subset=["_id"], ignore_index=True)
mtf = mtf.drop_duplicates(subset=["_id"], ignore_index=True)
train = train.drop_duplicates(subset=["row ID"], ignore_index=True)

with joblib.parallel_backend("multiprocessing"): 

    show("## RF")
    for data, feature_name in zip([lda, liwc, mtf], ["LDA50Features", "LIWCFeatures", "ManuallyDefinedTextFeatures"]):
        show(feature_name)
             
        df = data.merge(train, left_on='_id', right_on='row ID')
        df = df.dropna(subset=["gender"])

        Y = df["gender"].values
        X = df.drop([*train.columns, "_id"], axis=1).values
        
        fscore, accuracy = [], []
        fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
        for train_ix, test_ix in fold10.split(X, Y):
            clf = RandomForestClassifier(n_estimators=30, max_depth=300, min_samples_split=2, random_state=42)
            clf.fit(X[train_ix], Y[train_ix])
            y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
            accuracy.append(skm.accuracy_score(y_true, y_pred))
            fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

        accuracy = np.array(accuracy).mean(axis=0)
        fscore = np.array(fscore).mean(axis=0)
        show(f"accur: {accuracy}, f1score: {fscore}")

    show("## Ada boost")    
    for data, feature_name in zip([lda, liwc, mtf], ["LDA50Features", "LIWCFeatures", "ManuallyDefinedTextFeatures"]):
        show(feature_name)
        
        df = data.merge(train, left_on='_id', right_on='row ID')
        df = df.dropna(subset=["gender"])

        Y = df["gender"].values
        X = df.drop([*train.columns, "_id"], axis=1).values

        fscore, accuracy = [], []
        fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
        for train_ix, test_ix in fold10.split(X, Y):
            clf = AdaBoostClassifier(n_estimators=50, random_state=42)
            clf.fit(X[train_ix], Y[train_ix])
            y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
            accuracy.append(skm.accuracy_score(y_true, y_pred))
            fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

        accuracy = np.array(accuracy).mean(axis=0)
        fscore = np.array(fscore).mean(axis=0)
        show(f"accur: {accuracy}, f1score: {fscore}")
        
    show("## Linear SVC")    
    for data, feature_name in zip([lda, liwc, mtf], ["LDA50Features", "LIWCFeatures", "ManuallyDefinedTextFeatures"]):
        show(feature_name)
                
        df = data.merge(train, left_on='_id', right_on='row ID')
        df = df.dropna(subset=["gender"])

        Y = df["gender"].values
        X = df.drop([*train.columns, "_id"], axis=1).values

        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)
        
        fscore, accuracy = [], []
        fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
        for train_ix, test_ix in fold10.split(X, Y):
            clf = LinearSVC(max_iter=1000)
            clf.fit(X[train_ix], Y[train_ix])
            y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
            accuracy.append(skm.accuracy_score(y_true, y_pred))
            fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

        accuracy = np.array(accuracy).mean(axis=0)
        fscore = np.array(fscore).mean(axis=0)
        show(f"accur: {accuracy}, f1score: {fscore}")


'# Twitter'

'## RF'

'LDA50Features'

'accur: 0.7318952807421664, f1score: 0.724711129113902'

'LIWCFeatures'

'accur: 0.6921710398071553, f1score: 0.6814599894179255'

'ManuallyDefinedTextFeatures'

'accur: 0.6652119448486109, f1score: 0.6524614766023734'

'## Ada boost'

'LDA50Features'

'accur: 0.7105725487396736, f1score: 0.7036657072713572'

'LIWCFeatures'

'accur: 0.6888950967929693, f1score: 0.67970883891651'

'ManuallyDefinedTextFeatures'

'accur: 0.6641510427492552, f1score: 0.6469342255292789'

'## Linear SVC'

'LDA50Features'

'accur: 0.699319525246341, f1score: 0.6855360505414763'

'LIWCFeatures'

'accur: 0.693106984303917, f1score: 0.6765390374056139'

'ManuallyDefinedTextFeatures'

'accur: 0.6397842444398254, f1score: 0.581810388189633'

In [30]:
show("# Instagram")

image_concepts = pd.DataFrame()
train = pd.DataFrame()

# TODO add city as feature
for city, city_train in [("NewYork", "NY"), ("London", "London"), ("Singapore", "Singapore")]:
    image_concepts = pd.concat([image_concepts, pd.read_csv(f"data/Features/features{city}/Instagram/imageConceptsFeatures.csv", encoding="ISO-8859-15")], axis=0, ignore_index=True)
  
    train = pd.concat([train, pd.read_csv(f"data/{city_train}Train.csv")], axis=0, ignore_index=True)

image_concepts = image_concepts.drop_duplicates(subset=["_id"], ignore_index=True)
train = train.drop_duplicates(subset=["row ID"], ignore_index=True)

with joblib.parallel_backend("multiprocessing"): 

    show("## RF")

    data = image_concepts
    df = data.merge(train, left_on='_id', right_on='row ID')
    df = df.dropna(subset=["gender"])

    Y = df["gender"].values
    X = df.drop([*train.columns, "_id"], axis=1).values

    fscore, accuracy = [], []
    fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ix, test_ix in fold10.split(X, Y):
        clf = RandomForestClassifier(n_estimators=30, max_depth=300, min_samples_split=2, random_state=42)
        clf.fit(X[train_ix], Y[train_ix])
        y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
        accuracy.append(skm.accuracy_score(y_true, y_pred))
        fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

    accuracy = np.array(accuracy).mean(axis=0)
    fscore = np.array(fscore).mean(axis=0)
    show(f"accur: {accuracy}, f1score: {fscore}")

    show("## Ada boost")    
        
    fscore, accuracy = [], []
    fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ix, test_ix in fold10.split(X, Y):
        clf = AdaBoostClassifier(n_estimators=50, random_state=42)
        clf.fit(X[train_ix], Y[train_ix])
        y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
        accuracy.append(skm.accuracy_score(y_true, y_pred))
        fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

    accuracy = np.array(accuracy).mean(axis=0)
    fscore = np.array(fscore).mean(axis=0)
    show(f"accur: {accuracy}, f1score: {fscore}")
        
    show("## Linear SVC")    
     
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
        
    fscore, accuracy = [], []
    fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ix, test_ix in fold10.split(X, Y):
        clf = LinearSVC(max_iter=1000)
        clf.fit(X[train_ix], Y[train_ix])
        y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
        accuracy.append(skm.accuracy_score(y_true, y_pred))
        fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

    accuracy = np.array(accuracy).mean(axis=0)
    fscore = np.array(fscore).mean(axis=0)
    show(f"accur: {accuracy}, f1score: {fscore}")


'# Instagram'

'## RF'

'accur: 0.673686750274058, f1score: 0.6689067642636951'

'## Ada boost'

'accur: 0.5866802515564926, f1score: 0.5566290042688148'

'## Linear SVC'



'accur: 0.5900872265319717, f1score: 0.4700363761773512'

In [31]:
show("# Foursquare")

lda6 = pd.DataFrame()
train = pd.DataFrame()

# TODO add city as feature
for city, city_train in [("NewYork", "NY"), ("London", "London"), ("Singapore", "Singapore")]:
    lda6 = pd.concat([lda6, pd.read_csv(f"data/Features/features{city}/Foursquare/venueCategoriesLDA6Features.csv", encoding="ISO-8859-15")], axis=0, ignore_index=True)

    train = pd.concat([train, pd.read_csv(f"data/{city_train}Train.csv")], axis=0, ignore_index=True)

lda6 = lda6.drop_duplicates(subset=["_id"], ignore_index=True)
train = train.drop_duplicates(subset=["row ID"], ignore_index=True)

with joblib.parallel_backend("multiprocessing"): 

    show("## RF")
    
    data = lda6
    df = data.merge(train, left_on='_id', right_on='row ID')
    df = df.dropna(subset=["gender"])

    Y = df["gender"].values
    X = df.drop([*train.columns, "_id"], axis=1).values

    fscore, accuracy = [], []
    fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ix, test_ix in fold10.split(X, Y):
        clf = RandomForestClassifier(n_estimators=30, max_depth=300, min_samples_split=2, random_state=42)
        clf.fit(X[train_ix], Y[train_ix])
        y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
        accuracy.append(skm.accuracy_score(y_true, y_pred))
        fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

    accuracy = np.array(accuracy).mean(axis=0)
    fscore = np.array(fscore).mean(axis=0)
    show(f"accur: {accuracy}, f1score: {fscore}")

    show("## Ada boost")    
        
    fscore, accuracy = [], []
    fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ix, test_ix in fold10.split(X, Y):
        clf = AdaBoostClassifier(n_estimators=50, random_state=42)
        clf.fit(X[train_ix], Y[train_ix])
        y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
        accuracy.append(skm.accuracy_score(y_true, y_pred))
        fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

    accuracy = np.array(accuracy).mean(axis=0)
    fscore = np.array(fscore).mean(axis=0)
    show(f"accur: {accuracy}, f1score: {fscore}")
        
    show("## Linear SVC")    

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
        
    fscore, accuracy = [], []
    fold10 = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ix, test_ix in fold10.split(X, Y):
        clf = LinearSVC(max_iter=1000)
        clf.fit(X[train_ix], Y[train_ix])
        y_true, y_pred = Y[test_ix], clf.predict(X[test_ix])
        accuracy.append(skm.accuracy_score(y_true, y_pred))
        fscore.append(skm.f1_score(y_true, y_pred, average="weighted"))

    accuracy = np.array(accuracy).mean(axis=0)
    fscore = np.array(fscore).mean(axis=0)
    show(f"accur: {accuracy}, f1score: {fscore}")


'# Foursquare'

'## RF'

'accur: 0.6102399301542154, f1score: 0.5973504087991597'

'## Ada boost'

'accur: 0.636132426948832, f1score: 0.6003453292626075'

'## Linear SVC'

'accur: 0.6234775458431601, f1score: 0.5502709435409617'