In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# HELPER LIST

raw_type_list = ['User_ID', 'Gender', 'Age', 'Occupation', 'Star_Sign', 'Date', 'Text']
top_word_list = ['anyways', 'cuz', 'digest', 'diva', 'evermean', 'fox', 'gonna', 'greg', 'haha', 'jayel',
                 'kinda', 'levengals', 'literacy', 'lol', 'melissa', 'nan', 'nat', 'postcount', 'ppl', 'rick',
                 'school', 'shep', 'sherry', 'spanners', 'teri', 'u', 'ur', 'urllink', 'wanna', 'work']
top_type_list = ['Instance_ID'] + top_word_list + ['Class']
raw_file_dict = {'train': 'train_raw.csv', 'dev': 'dev_raw.csv', 'test': 'test_raw.csv'}
top_file_dict = {'train': 'train_top10.csv', 'dev': 'dev_top10.csv', 'test': 'test_top10.csv'}

age_distribution = {'14-16': 98454, '24-26': 141104, '34-36': 30347, '44-46': 6510}

In [3]:
# HELPER FUNCTION

def read_data(file_name):
    if 'raw' in file_name:
        input_data = pd.read_csv(file_name, names=raw_type_list)
    else:
        input_data = pd.read_csv(file_name, names=top_type_list)
    return input_data

def read_id(file_name):
    f = open(file_name)
    lst = []

    for line in f.readlines():
        row = line.rstrip().split(',')
        lst.append(row[0])
    f.close()
    return lst

def preprocess(id_lst, X_lst, y_lst, num):
    X_dict = {x: [0 for i in range(num)] for x in id_lst}
    y_dict = {}
    for i in range(len(id_lst)):
        X_dict[id_lst[i]] += X_lst[i]
        y_dict[id_lst[i]] = y_lst[i]

    id_set = [k for k in X_dict.keys()]
    X = [v.tolist() for v in X_dict.values()]
    y = [v for v in y_dict.values()]

    return id_set, X, y

def convert_back(y_pred, id_lst, id_set):
    pred_dict = {}

    for i in range(len(id_set)):
        pred_dict[id_set[i]] = y_pred[i]

    y_new_pred = []
    for i in range(len(id_lst)):
        y_new_pred.append(pred_dict[id_lst[i]])

    return y_new_pred

In [4]:
# train data - get feature names
train_raw = raw_file_dict['train']
train_text = read_data(train_raw).iloc[:, -1]
train_text = train_text.replace('[^a-zA-Z ]', '', regex=True)

train_CV = CountVectorizer(lowercase=True)
train_table = train_CV.fit_transform(train_text)
train_feature_names = train_CV.get_feature_names()

In [5]:
# train data - read given 2 datasets
train_top = top_file_dict['train']
train_id = read_id(train_raw)
train_data = read_data(train_top).as_matrix()
y_train = train_data[:, -1]

In [6]:
# train data - select 500 best feature names
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(score_func=chi2, k=1000)
selector.fit_transform(train_table, y=y_train)

mask = selector.get_support()
train_features = []
train_features_index = []

for bool, index in zip(mask, range(len(train_feature_names))):
    if bool:
        train_features.append(train_feature_names[index])
        train_features_index.append(index)

In [7]:
# convert into X_train format
train_word_data = []
for i in range(len(train_text)):
    if i % 10000 == 0: print(i)
    train_ins_data = train_table[i].toarray()[0]
    train_ins_word = [train_ins_data[j] for j in train_features_index]
    train_word_data.append(train_ins_word)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000


In [8]:
# X_train data to numpy array
train_data_np = np.array(train_word_data)
train_data_np.shape

(276415, 1000)

In [9]:
# X_train numpy array to pandas DataFrame
train_data_df = pd.DataFrame(train_data_np, columns=train_features)

In [10]:
# dev data
dev_top = top_file_dict['dev']
dev_raw = raw_file_dict['dev']
dev_id = read_id(dev_raw)
dev_data = read_data(dev_top).as_matrix()
y_dev = dev_data[:, -1]

In [11]:
# dev data - get feature names
dev_text = read_data(dev_raw).iloc[:,-1]
dev_text = dev_text.replace('[^a-zA-Z ]', '', regex=True)

dev_CV = CountVectorizer(lowercase=True)
dev_table = dev_CV.fit_transform(dev_text)
dev_feature_names = dev_CV.get_feature_names()

In [12]:
# dev data - select same k features
dev_features = []
dev_features_index = []
for i in range(len(dev_feature_names)):
    if dev_feature_names[i] in train_features:
        dev_features_index.append(i)
        dev_features.append(dev_feature_names[i])

In [13]:
# convert into X_dev format
dev_word_data = []
for i in range(len(dev_text)):
    if i % 10000 == 0: print(i)
    dev_ins_data = dev_table[i].toarray()[0]
    dev_ins_word = [dev_ins_data[j] for j in dev_features_index]
    dev_word_data.append(dev_ins_word)

0
10000
20000
30000
40000


In [14]:
# X_dev data into numpy array
dev_data_np = np.array(dev_word_data)
dev_data_np.shape

(45332, 912)

In [15]:
# X_dev numpy array into pandas DataFrame
dev_data_df = pd.DataFrame(dev_data_np, columns=dev_features)

In [16]:
# fill the non-existing features with all zeros column
dev_missed_features = [x for x in train_features if x not in dev_features]
for miss in dev_missed_features:
    dev_data_df[miss] = [0 for i in range(len(dev_data_np))]
dev_data_df = dev_data_df.sort_index(axis=1)

In [17]:
X_train_ = train_data_df.as_matrix()
X_dev_ = dev_data_df.as_matrix()

In [22]:
# test data
test_top = top_file_dict['test']
test_raw = raw_file_dict['test']
test_id = read_id(test_raw)
test_data = read_data(test_top).as_matrix()
y_test = test_data[:, -1]
test_index = test_data[:, 0].tolist()

In [23]:
# test data - get feature names
test_text = read_data(test_raw).iloc[:,-1]
test_text = test_text.replace('[^a-zA-Z ]', '', regex=True)

test_CV = CountVectorizer(lowercase=True)
test_table = test_CV.fit_transform(test_text)
test_feature_names = test_CV.get_feature_names()

In [24]:
# test data - select same k features
test_features = []
test_features_index = []
for i in range(len(test_feature_names)):
    if test_feature_names[i] in train_features:
        test_features_index.append(i)
        test_features.append(test_feature_names[i])

In [25]:
# convert into X_test format
test_word_data = []
for i in range(len(test_text)):
    if i % 10000 ==0:
        print(i)
    test_ins_data = test_table[i].toarray()[0]
    test_ins_word = [test_ins_data[j] for j in test_features_index]
    test_word_data.append(test_ins_word)

0
10000
20000
30000
40000


In [26]:
# X_test data into numpy array
test_data_np = np.array(test_word_data)
test_data_np.shape

(43014, 909)

In [27]:
# X_test numpy array into pandas DataFrame
test_data_df = pd.DataFrame(test_data_np, columns=test_features)

In [28]:
# fill the non-existing features with all zeros column
test_missed_features = [x for x in train_features if x not in test_features]
for miss in test_missed_features:
    test_data_df[miss] = [0 for i in range(len(test_data_np))]
test_data_df = test_data_df.sort_index(axis=1)

In [29]:
X_test_ = test_data_df.as_matrix()

In [None]:
# DATA PREPROCESSING FINISHED

In [19]:
train_id_set, X_new_train, y_new_train = preprocess(train_id, X_train_, y_train, 1000)
dev_id_set, X_new_dev, y_new_dev = preprocess(dev_id, X_dev_, y_dev, 1000)

In [30]:
test_id_set, X_new_test, y_new_test = preprocess(test_id, X_test_, y_test, 1000)

In [20]:
# Decision Tree - Dev - 53.51%
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=None)
dt.fit(X_new_train, y_new_train)
y_dev_pred = dt.predict(X_new_dev)
y_new_dev_pred = convert_back(y_dev_pred, dev_id, dev_id_set)
print("Decision Tree (Dev):", accuracy_score(y_dev, y_new_dev_pred))

Decision Tree (Dev): 0.49896320480014117


In [21]:
# Logistic Regression - Test - 61.46%
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_new_train, y_new_train)
y_dev_pred = clf.predict(X_new_dev)
y_new_dev_pred = convert_back(y_dev_pred, dev_id, dev_id_set)
print("Logistic Regression (Dev):", accuracy_score(y_dev, y_new_dev_pred))

Logistic Regression (Dev): 0.6177093443924822


In [31]:
# make prediction for test data
clf = LogisticRegression()
clf.fit(X_new_train, y_new_train)
y_test_pred = clf.predict(X_new_test)
y_new_test_pred = convert_back(y_test_pred, test_id, test_id_set)

In [32]:
# make output csv file - use logistic regression
lst = test_data[:, 0].tolist()
df = pd.DataFrame()
df['Id'] = lst
df['Prediction'] = y_new_test_pred
df.to_csv('out-1000.csv', sep=',', index=False)

In [None]:
# Random Forest - Dev - 57.83%
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=None, random_state=0)
clf.fit(X_new_train, y_new_train)
y_dev_pred = clf.predict(X_new_dev)
y_new_dev_pred = convert_back(y_dev_pred, dev_id, dev_id_set)
print("Random Forest (Dev):", accuracy_score(y_dev, y_new_dev_pred))

In [None]:
# Random Forest + Grid Search CV - Dev - 54.63%
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [200, 500, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

rfc = RandomForestClassifier(max_depth=None, random_state=0)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_new_train, y_new_train)
CV_rfc.best_params_

In [None]:
rfc1 = RandomForestClassifier(random_state=0, max_features='auto', n_estimators=700, max_depth=None)
rfc1.fit(X_new_train, y_new_train)
y_dev_pred = clf.predict(X_new_dev)
y_new_dev_pred = convert_back(y_dev_pred, dev_id, dev_id_set)
print("Grid Search (Dev):", accuracy_score(y_dev, y_new_dev_pred))

In [None]:
# Find instances with all zeros
count = 0
for i in range(len(X_new_train)):
    allzero = True
#     print(i, X_new_train[i])
    for j in X_new_train[i]:
        if j != 0:
            allzero = False
    if allzero:
        count += 1
#         print(i, X_new_train[i])
print(count / len(X_new_train))

In [None]:
# Linear SVC - Dev - 38.56%
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0)
clf.fit(X_new_train, y_new_train)
y_dev_pred = clf.predict(X_new_dev)
y_new_dev_pred = convert_back(y_dev_pred, dev_id, dev_id_set)
print("Linear SVC (Dev):", accuracy_score(y_dev, y_new_dev_pred))

In [None]:
from sklearn.feature_extraction import DictVectorizer

file_name = top_file_dict['train']
data_train = read_data(file_name)
X_train = data_train.iloc[:, 1:-1]
y_train = data_train.iloc[:, -1]

file_name = top_file_dict['dev']
data_test = read_data(file_name)
X_test = data_test.iloc[:, 1:-1]
y_test = data_test.iloc[:, -1]

In [None]:
# Zero R - 38% / 51%
from sklearn.dummy import DummyClassifier
zero_r = DummyClassifier(strategy='most_frequent')
zero_r.fit(X_train, y_train)
print("Zero-R:", accuracy_score(zero_r.predict(X_dev), y_dev))
# cross_val_score(zero_r, X, y, cv=10)

In [None]:
# Decision Tree - 43.296% / 58%
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=25)
dt.fit(X_train, y_train)
print("Decision Tree:", dt.score(X_dev, y_dev))

In [None]:
# Random Forest - 43.512% <21>
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=21, random_state=0)
clf.fit(X_train, y_train)
print("Random Forest:", clf.score(X_test, y_test))

In [None]:
# Extra Tree Classifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier()
clf.fit(X_train, y_train)
print("Extra Trees:", clf.score(X_test, y_test))

model = SelectFromModel(clf, prefit=True)
X_new_train = model.transform(X_train)
X_new_test = model.transform(X_test)
clf.fit(X_new_train, y_train)
y_pred = clf.predict(X_new_test)
print("Extra Trees:", accuracy_score(y_test, y_pred))

In [None]:
# MLP Classifier - 0.4352775081620048<25,2>
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(25, 2), random_state=1)
clf.fit(X_train, y_train)
print("MLP Classifier:", clf.score(X_test, y_test))

In [None]:
# Prediction for test data

file_name = top_file_dict['test']
data_train = read_raw_data(file_name)
index = data_train.iloc[:, 0]
X_pred = data_train.iloc[:, 1:-1]
y_pred = clf.predict(X_pred)

df = pd.DataFrame()
df['Id'] = index
df['Prediction'] = y_pred
df.to_csv('out.csv', sep=',', index=False)

In [None]:
# KNN - %
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
print("KNN:", neigh.score(X_test, y_test))

In [None]:
# Linear SVC - %
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)
print("LinearSVC:", clf.score(X_test, y_test))

In [None]:
# SVC - %
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train, y_train)
print("SVC:", clf.score(X_test, y_test))

In [None]:
# Logistic Regression - 42.63% / 57%
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
print("Logistic:", clf.score(X_test, y_test))

In [None]:
# Multinomial NB - 43% / 58%
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
print("MultinomialNB:", clf.score(X_test, y_test))

In [None]:
# Gaussian NB - 41% / 56%
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
print("GaussianNB:", clf.score(X_test, y_test))

In [None]:
import re

def convert_digit(raw):
    if raw in range(14, 17): return 0
    elif raw in range(24, 27): return 1
    elif raw in range(34, 37): return 2
    elif raw in range(44, 47): return 3

data = pd.read_csv("train_raw.csv", names=raw_type_list)
X = []
y = []
for index, text in data.iterrows():
    if index > 1000:
        break
    nopunc = re.sub(r'[^a-zA-Z0-9 ]', '', text['Text'])
    X.append(len(set(nopunc.split())))
    y.append(convert_digit(text['Age']))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [None]:
from pattern.en import lemma

print(lemma('working'))

In [None]:
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.naive_bayes import MultinomialNB

# Build a classification task using 3 informative features
# Create the RFE object and compute a cross-validated score.
clf = LinearSVC(C=1.0)
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

In [None]:
data = read_raw_data('train_raw.csv')

X = []
X_length = []
for index, text in data.iterrows():
    user_id = text['User_ID']
#     print(user_id)
    nopunc = re.sub(r'[^a-zA-Z0-9 ]', '', text['Text'])
    X.append(nopunc.split())
    X_length.append(nopunc.split())

# print(sum(X_length)/len(X_length))

In [None]:
# Age Distribution in train data
count = {}
for k in ['14-16','24-26','34-36','44-46']:
    count[k]=0
total = 0
for k in y_train:
    count[k] += 1
for k in ['14-16','24-26','34-36','44-46']:
    total += count[k]

print(count)

In [None]:
dic = {word: {'14-16': 0, '24-26': 0, '34-36': 0, '44-46': 0} for word in top_word_list}
    
for word in top_word_list:
    for index, row in data_train.iterrows():
        if row[word] > 0:
            dic[word][y_train[index]] += 1

for word in top_word_list:
    _sum = 0
    for j in ['14-16','24-26','34-36','44-46']:
        dic[word][j] = total*dic[word][j]/count[j]
        _sum += dic[i][j]
    for j in ['14-16','24-26','34-36','44-46']:
        dic[word][j] = dic[word][j] / _sum

print(dic)

In [None]:
# Age distributions of top 30 words

word_dict = {word: {'14-16': 0, '24-26': 0, '34-36': 0, '44-46': 0} for word in top_word_list}
for index, row in data_train.iterrows():
    for word in top_word_list:
        if row[word] != 0:
            word_dict[word][row['Class']] += 1
            
word_dict = {'anyways': {'14-16': 4944, '24-26': 2441, '34-36': 168, '44-46': 13},
             'cuz': {'14-16': 4819, '24-26': 1381, '34-36': 233, '44-46': 8},
             'digest': {'14-16': 36, '24-26': 102, '34-36': 369, '44-46': 4},
             'diva': {'14-16': 31, '24-26': 125, '34-36': 453, '44-46': 0},
             'evermean': {'14-16': 0, '24-26': 0, '34-36': 105, '44-46': 0},
             'fox': {'14-16': 187, '24-26': 508, '34-36': 111, '44-46': 417},
             'gonna': {'14-16': 10597, '24-26': 4874, '34-36': 547, '44-46': 207},
             'greg': {'14-16': 151, '24-26': 464, '34-36': 112, '44-46': 317},
             'haha': {'14-16': 6416, '24-26': 1364, '34-36': 70, '44-46': 5},
             'jayel': {'14-16': 0, '24-26': 0, '34-36': 0, '44-46': 157},
             'kinda': {'14-16': 6598, '24-26': 3635, '34-36': 336, '44-46': 110},
             'levengals': {'14-16': 0, '24-26': 0, '34-36': 0, '44-46': 137},
             'literacy': {'14-16': 11, '24-26': 64, '34-36': 171, '44-46': 6},
             'lol': {'14-16': 8422, '24-26': 1557, '34-36': 408, '44-46': 444},
             'melissa': {'14-16': 250, '24-26': 275, '34-36': 21, '44-46': 225},
             'nan': {'14-16': 88, '24-26': 56, '34-36': 6, '44-46': 145},
             'nat': {'14-16': 195, '24-26': 78, '34-36': 2, '44-46': 204},
             'postcount': {'14-16': 0, '24-26': 356, '34-36': 389, '44-46': 0},
             'ppl': {'14-16': 3343, '24-26': 531, '34-36': 36, '44-46': 0},
             'rick': {'14-16': 125, '24-26': 350, '34-36': 148, '44-46': 519},
             'school': {'14-16': 13610, '24-26': 9365, '34-36': 1853, '44-46': 414},
             'shep': {'14-16': 3, '24-26': 0, '34-36': 2, '44-46': 190},
             'sherry': {'14-16': 11, '24-26': 48, '34-36': 13, '44-46': 180},
             'spanners': {'14-16': 0, '24-26': 1, '34-36': 99, '44-46': 0},
             'teri': {'14-16': 6, '24-26': 12, '34-36': 3, '44-46': 102},
             'u': {'14-16': 9139, '24-26': 2547, '34-36': 142, '44-46': 28},
             'ur': {'14-16': 2909, '24-26': 537, '34-36': 14, '44-46': 1},
             'urllink': {'14-16': 14835, '24-26': 46685, '34-36': 13404, '44-46': 2139},
             'wanna': {'14-16': 5750, '24-26': 2196, '34-36': 210, '44-46': 46},
             'work': {'14-16': 9719, '24-26': 26484, '34-36': 5452, '44-46': 1211}}

Plot the bar graph
for k, v in word_dict.items():
    print(v)
    plt.bar(range(len(v)), list(v.values()), align='center')
    plt.xticks(range(len(v)), list(v.keys()))
    plt.title(k)
    plt.show()
    
age_dict = {'14-16': {}, '24-26': {}, '34-36': {}, '44-46': {}}
new_age_dict = {'14-16': {}, '24-26': {}, '34-36': {}, '44-46': {}}

for k, v in word_dict.items():
    for ks, vs in age_dict.items():
        age_dict[ks][k] = word_dict[k][ks]

for k, v in age_dict.items():
    new_age_dict[k] = list(reversed(sorted(v.items(), key=operator.itemgetter(1))))

# Top 10 feature distributions (normalised)
'14-16': [('ppl', 0.8727789149847253), ('ur', 0.8698564164613034), ('haha', 0.8364584267053635), 
          ('u', 0.7744724558628764), ('cuz', 0.7236296537282069), ('anyways', 0.6691182379567854), 
          ('wanna', 0.6640323036842943), ('gonna', 0.5605997007713579), ('kinda', 0.5550158468463889), 
          ('lol', 0.47997133335565567)]
'24-26': [('work', 0.287832939550357), ('urllink', 0.2643041783638263), ('anyways', 0.230508118020287), 
          ('kinda', 0.2133494499066458), ('school', 0.20157074400041788), ('gonna', 0.17990759264086534), 
          ('wanna', 0.17694884758888418), ('postcount', 0.16445487302236883), ('u', 0.15060184225208706), 
          ('cuz', 0.14469287565639952)]
'34-36': [('evermean', 1.0), ('spanners', 0.9978323018665548), ('diva', 0.9255497682681122), 
          ('digest', 0.8771516331479886), ('postcount', 0.8355451269776312), ('literacy', 0.7912104527801802), 
          ('urllink', 0.35284555075250373), ('work', 0.2755093369403132), ('school', 0.18544652384760632), 
          ('cuz', 0.11350969350726677)]
'44-46': [('levengals', 1.0), ('jayel', 1.0), ('shep', 0.9967087407019541), ('teri', 0.9846137317830624), 
          ('sherry', 0.9691455026541897), ('nan', 0.937361638189683), ('nat', 0.9234049260898503), 
          ('rick', 0.9023549096467094), ('fox', 0.8749225398181598), ('melissa', 0.8696562791017377)]


In [None]:
# HELPER FUNCTION
def convert_class(raw):
    if raw == '14-16': return 0
    elif raw == '24-26': return 1
    elif raw == '34-36': return 2
    elif raw == '44-46': return 3

def convert_dict(file_name):
    X = []
    y = []
    f = open(file_name, 'r')
    for line in f:
        atts = line[:-1].split(",")
        this = {}
        count = 1
        for type in top_word_list:
            this[type] = int(atts[count])
            count += 1
        if atts[-1] != '?':
            X.append(this)
            y.append(convert_class(atts[-1]))
    f.close()
    return X, y