In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve,f1_score
# Seed passed as random_state to train_test_split so the train/validation split is repeatable.
SEED = 100

In [2]:
# Read the raw data.
# NOTE(review): these are absolute, machine-specific paths; point them at a
# configurable data directory before sharing the notebook.
TRAIN_PATH = "/Users/liu/Desktop/train.csv"
TEST_PATH = "/Users/liu/Desktop/test.csv"

# Only part of the training set can be processed on this machine, so cap the
# number of rows while parsing (nrows) instead of loading the whole file and
# throwing most of it away with .head().
N_TRAIN_ROWS = 100000

train = pd.read_csv(TRAIN_PATH, nrows=N_TRAIN_ROWS)
test = pd.read_csv(TEST_PATH)

In [3]:
# Number of training rows that were loaded
train.shape[0]

100000

In [4]:
# Data discovery: some test rows have no keywords at all. Remember their IDs
# (they receive a constant fallback prediction later) and keep only the rows
# that do carry keywords for feature extraction.
# Note: `test` is overwritten here, so re-running this cell on the already
# filtered frame produces an empty test_null_id.
keywords_missing = test['keywords'].isnull()
test_null_id = test.loc[keywords_missing, 'ID'].tolist()  # 363042 IDs
test = test[~keywords_missing]

In [5]:
# Number of test rows left after removing the rows without keywords
test.shape[0]

2748743

In [6]:
# Number of training rows (unchanged by the keyword filtering of the test set)
train.shape[0]

100000

In [7]:
# Number of test IDs that had no keywords; these get a constant fallback prediction later
len(test_null_id)

363042

In [8]:
# How many training rows there are for each sex
train.loc[:, 'sex'].value_counts()

M    55068
F    44932
Name: sex, dtype: int64

In [9]:
# Stack train and test into one frame so the feature preparation runs once for both.
# Test rows have no age/sex, which is how they are told apart again later.
data = pd.concat((train, test), ignore_index=True)
# Release the two source frames; only `data` is used from here on.
del train
del test
data.shape

(2848743, 4)

In [10]:
# Build a function that turns one raw keyword string into a dictionary.
def create_dict(x):
    """Parse a ``word:count;word:count;...`` string into ``{word: count}``.

    The input is coerced with ``str`` first, so a missing value (NaN) becomes
    the text ``"nan"``, which contains no ``":"`` and therefore yields an
    empty dict. Counts are kept as strings exactly as they appear in the
    input; entries without a ``":"`` are ignored.

    Each entry is split on its LAST colon, so a word that itself contains
    ``":"`` keeps its full text and the trailing segment is taken as the
    count (splitting on every colon would pick a middle segment instead).

    Args:
        x: Raw value of the ``keywords`` column (string or NaN).

    Returns:
        dict mapping each word to its count (as a string).
    """
    dict_keyword = {}
    # Entries are separated by ";".
    for entry in str(x).split(";"):
        if ":" not in entry:
            continue
        word, count = entry.rsplit(":", 1)
        dict_keyword[word] = count
    return dict_keyword

In [11]:
# Parse the raw keyword string of every row into a {word: count} dictionary
data['keyword_dict'] = data['keywords'].map(create_dict)
data.head()

Unnamed: 0,ID,keywords,age,sex,keyword_dict
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62.0,F,"{'fibre': '16', 'quoi': '1', 'dangers': '1', '..."
1,2,restaurant:1;marrakech.shtml:1,35.0,M,"{'restaurant': '1', 'marrakech.shtml': '1'}"
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45.0,F,"{'payer': '1', 'faq': '1', 'taxe': '1', 'habit..."
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46.0,F,"{'rigaud': '3', 'laurent': '3', 'photo': '11',..."
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42.0,F,"{'societe': '1', 'disparition': '1', 'proche':..."


In [12]:
# List the words present in each row (list() over a dict returns its keys)
data['keys'] = data['keyword_dict'].map(list)
data.head()

Unnamed: 0,ID,keywords,age,sex,keyword_dict,keys
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62.0,F,"{'fibre': '16', 'quoi': '1', 'dangers': '1', '...","[fibre, quoi, dangers, combien, hightech, que,..."
1,2,restaurant:1;marrakech.shtml:1,35.0,M,"{'restaurant': '1', 'marrakech.shtml': '1'}","[restaurant, marrakech.shtml]"
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45.0,F,"{'payer': '1', 'faq': '1', 'taxe': '1', 'habit...","[payer, faq, taxe, habitation, macron, qui, de..."
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46.0,F,"{'rigaud': '3', 'laurent': '3', 'photo': '11',...","[rigaud, laurent, photo, profile, photoprofile..."
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42.0,F,"{'societe': '1', 'disparition': '1', 'proche':...","[societe, disparition, proche, m%c3%a9lanie.go..."


In [13]:
# Helper that flattens arbitrarily nested lists
def flat(l: list):
    """Lazily yield the non-list items of ``l``, descending into nested lists.

    Only ``list`` instances are expanded; other values (including strings and
    tuples) are yielded unchanged, in their original left-to-right order.
    """
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend into the nested list; its parent iterator resumes afterwards.
                pending.append(iter(item))
                break
            yield item
        else:
            # Current list exhausted: go back to its parent.
            pending.pop()

# Collect the per-row word lists and flatten them into one long list of words
lst = list(flat(data['keys'].tolist()))


In [14]:
# Put the flattened words into a single-column DataFrame
key_count = pd.DataFrame({'key': lst})

In [15]:
# Count the occurrences of each word (most frequent first) and save the table
word_counts = (
    key_count['key']
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)
word_counts.to_csv("word_counts.csv",index=False)
word_counts.head()

Unnamed: 0,word,count
0,les,652547
1,des,457913
2,france,392502
3,sur,327798
4,forum,323008


In [16]:
# The 100 most frequent words become the model features
save_word_top = word_counts['word'].head(100).tolist()


In [17]:
# Look up the value stored for a word; 0 when the word is absent
def get_word_value(word_dict,word):
    """Return ``word_dict[word]`` if ``word`` is a key of ``word_dict``, else ``0``."""
    if word in word_dict:
        return word_dict[word]
    return 0

# Create one integer count column per top word.
# A feature column is named after its word, so a top word that equals one of
# the existing columns would silently overwrite it (e.g. the 'age' or 'sex'
# targets). Fail loudly instead of corrupting the frame.
reserved_cols = {'ID', 'keywords', 'age', 'sex', 'keyword_dict', 'keys'}
clashing_words = sorted(reserved_cols.intersection(save_word_top))
if clashing_words:
    raise ValueError(
        "top words collide with existing column names: %s" % clashing_words
    )

for word in save_word_top:
    # Counts are stored as strings in keyword_dict (0 when absent); convert
    # each column to int right away rather than keeping 100 object columns
    # around until the end of the loop.
    data[word] = data['keyword_dict'].map(lambda x: get_word_value(x, word)).astype(int)
data.head()

Unnamed: 0,ID,keywords,age,sex,keyword_dict,keys,les,des,france,sur,...,maison,week,ces,detail,sciences,sport,trump,femmes,photo,reforme
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62.0,F,"{'fibre': '16', 'quoi': '1', 'dangers': '1', '...","[fibre, quoi, dangers, combien, hightech, que,...",2,1,0,16,...,0,0,0,0,0,0,0,0,0,0
1,2,restaurant:1;marrakech.shtml:1,35.0,M,"{'restaurant': '1', 'marrakech.shtml': '1'}","[restaurant, marrakech.shtml]",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45.0,F,"{'payer': '1', 'faq': '1', 'taxe': '1', 'habit...","[payer, faq, taxe, habitation, macron, qui, de...",0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46.0,F,"{'rigaud': '3', 'laurent': '3', 'photo': '11',...","[rigaud, laurent, photo, profile, photoprofile...",0,0,0,0,...,0,0,0,0,0,0,0,0,11,0
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42.0,F,"{'societe': '1', 'disparition': '1', 'proche':...","[societe, disparition, proche, m%c3%a9lanie.go...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Remove the helper columns that are no longer needed and encode sex as a number (F -> 0, M -> 1)
sex_map = dict(F=0, M=1)
data = data.drop(columns=['keywords','keyword_dict','keys'])
data['sex'] = data['sex'].map(sex_map)
data.head()


Unnamed: 0,ID,age,sex,les,des,france,sur,forum,affich,pour,...,maison,week,ces,detail,sciences,sport,trump,femmes,photo,reforme
0,1,62.0,0.0,2,1,0,16,0,0,3,...,0,0,0,0,0,0,0,0,0,0
1,2,35.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,45.0,0.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,46.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,0
4,5,42.0,0.0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Labelled rows (age present) form the training data; rows without an age are the test set.
# One mask is used for the features AND both targets so that they are
# guaranteed to describe the same rows, and .loc selects rows and columns in
# a single step (data[cols][mask] first copies every row of the 100 feature
# columns and only then filters).
is_labeled = data['age'].notnull()

train_x = data.loc[is_labeled, save_word_top]
train_age = data.loc[is_labeled, 'age']
train_sex = data.loc[is_labeled, 'sex']
# Every labelled row is expected to carry both targets.
assert train_sex.notnull().all(), "some rows have an age but no sex label"

test_x = data.loc[~is_labeled, save_word_top]
test_id = data.loc[~is_labeled, 'ID']




In [20]:

from sklearn.model_selection import train_test_split

In [21]:
# Hold out 20% of the labelled rows as a validation set (called val here).
# The features and both targets are split in ONE call, so they are always
# partitioned by the same row permutation; three separate calls only stay
# aligned as long as every input has the same length and the same seed.
(train_x, val_x,
 train_age, val_age,
 train_sex, val_sex) = train_test_split(
    train_x, train_age, train_sex, test_size=0.2, random_state=SEED
)



In [22]:
# Age model: LightGBM boosted trees trained on the keyword-count features.
# NOTE(review): no 'objective' is set here, so the library default applies; the log reports l2.
import lightgbm as lgb
params = dict(
    max_depth=20,
    learning_rate=0.001,
    boosting="gbdt",
    bagging_seed=11,
    metric='mse',
    verbosity=-1,
    gpu_platform_id=1,
    gpu_device_id=1,
)
trn_data = lgb.Dataset(train_x, label=train_age)
val_data = lgb.Dataset(val_x, label=val_age)
# Up to 5000 rounds, progress logged every 1000 rounds, early stopping with a patience of 500 rounds.
clf = lgb.train(
    params,
    trn_data,
    num_boost_round=5000,
    valid_sets=[trn_data, val_data],
    verbose_eval=1000,
    early_stopping_rounds=500,
)
# Predicted ages for the unlabelled test rows
pre_test_age = clf.predict(test_x)
'done'

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Training until validation scores don't improve for 500 rounds
[1000]	training's l2: 162.477	valid_1's l2: 161.724
[2000]	training's l2: 160.317	valid_1's l2: 160.196
[3000]	training's l2: 159.185	valid_1's l2: 159.721
[4000]	training's l2: 158.403	valid_1's l2: 159.571
[5000]	training's l2: 157.787	valid_1's l2: 159.539
Did not meet early stopping. Best iteration is:
[5000]	training's l2: 157.787	valid_1's l2: 159.539


'done'

In [23]:
# Sex model: binary LightGBM classifier (F -> 0, M -> 1) on the keyword-count features
import lightgbm as lgb
params = {
          'max_depth': 20,
          'learning_rate': 0.001,
          'objective': 'binary', # binary classification objective
          "boosting": "gbdt",
          "bagging_seed": 11,
          # An ordered list instead of a set: iterating a set of strings
          # depends on hash randomisation, so the metric order could change
          # from one kernel session to the next.
          "metric": ['binary_logloss', 'auc'],
          "verbosity": -1,
          'gpu_platform_id': 1,
          'gpu_device_id': 1
         }
trn_data = lgb.Dataset(train_x, label=train_sex)
val_data = lgb.Dataset(val_x, label=val_sex)

# Up to 5000 rounds, progress logged every 1000 rounds, early stopping with a patience of 500 rounds.
clf = lgb.train(params, trn_data, 5000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
pre_test_sex = clf.predict(test_x)
pre_val_sex = clf.predict(val_x)

# True label next to the predicted score for every validation row.
# 'pre' is a plain array and is assigned positionally; it is in the same row
# order as val_sex because both come from the same split.
val_realNpre = pd.DataFrame()
val_realNpre['real'] = val_sex
val_realNpre['pre'] = pre_val_sex

# Precision and recall for every candidate threshold
precision, recall, thresholds = precision_recall_curve(val_realNpre['real'],val_realNpre['pre'])

# thresholds has one element fewer than precision/recall; pad it with
# infinity so the three sequences can share one table.
thresholds = thresholds.tolist()
thresholds.append(np.inf)

# One row per threshold: precision, recall, thresholds
prc = pd.DataFrame()
prc['precision'] = precision
prc['recall'] = recall
prc['thresholds'] = thresholds
# F1 score from precision and recall
def create_f1(df):
    """Return the F1 score, ``2 * precision * recall / (precision + recall)``.

    ``df`` is anything that can be indexed with the keys ``'precision'`` and
    ``'recall'`` (for example a DataFrame row).
    """
    p = df['precision']
    r = df['recall']
    numerator = 2*p * r
    denominator = p + r
    return numerator/denominator


# create_f1 only uses column arithmetic, so it can be applied to the whole
# table at once instead of being called row by row through DataFrame.apply.
prc['f1_score'] = create_f1(prc)
# Best F1 score first
prc = prc.sort_values(by=['f1_score'],ascending=False)
prc.head()


# 'done'


Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.656696	training's auc: 0.642957	valid_1's binary_logloss: 0.657259	valid_1's auc: 0.632868
[2000]	training's binary_logloss: 0.648704	training's auc: 0.650992	valid_1's binary_logloss: 0.651109	valid_1's auc: 0.637407
[3000]	training's binary_logloss: 0.645073	training's auc: 0.655493	valid_1's binary_logloss: 0.6494	valid_1's auc: 0.637928
Early stopping, best iteration is:
[2505]	training's binary_logloss: 0.646681	training's auc: 0.653644	valid_1's binary_logloss: 0.650017	valid_1's auc: 0.638263


Unnamed: 0,precision,recall,thresholds,f1_score
640,0.589025,0.966277,0.391936,0.731898
642,0.589035,0.966187,0.392734,0.73188
639,0.588993,0.966277,0.391565,0.731873
636,0.588951,0.966367,0.390467,0.731867
643,0.589045,0.966097,0.39314,0.731862


In [24]:
# Threshold of the top row of prc, i.e. the one with the highest F1 score after sorting
pre_limit = float(prc['thresholds'].iloc[0])

In [25]:
# len(thresholds.tolist())

In [26]:
# Assemble the predictions for the test rows that have keywords.
sex_re_map = {0:'F',1:'M'}

sub = pd.DataFrame()
sub['ID'] = test_id
# Round to the nearest whole year; a bare astype(int) truncates (45.9 -> 45).
sub['age_pred'] = np.rint(pre_test_age).astype(int)
sub['sex_pred'] = pre_test_sex
# precision_recall_curve counts a sample as positive when score >= threshold,
# so apply the selected threshold with the same inclusive comparison.
sub['sex_pred'] = (sub['sex_pred'] >= pre_limit).astype(int)
sub['sex_pred'] = sub['sex_pred'].map(sex_re_map)
sub.head()

Unnamed: 0,ID,age_pred,sex_pred
100000,2,45,M
100001,3,55,M
100002,4,46,M
100003,5,45,M
100004,6,45,M


In [27]:
# Test rows without any keywords cannot be scored by the models;
# give them a constant prediction: age 40 and sex "M".
test_notmsg_sub = pd.DataFrame({
    'ID': test_null_id,
    'age_pred': 40,
    'sex_pred': "M",
})
test_notmsg_sub.head()

Unnamed: 0,ID,age_pred,sex_pred
0,1,40,M
1,8,40,M
2,14,40,M
3,17,40,M
4,18,40,M


In [28]:
# Combine the model predictions with the constant fallback rows and order everything by ID
sub_all = pd.concat((sub, test_notmsg_sub), ignore_index=True).sort_values('ID')
sub_all.head()

Unnamed: 0,ID,age_pred,sex_pred
2748743,1,40,M
0,2,45,M
1,3,55,M
2,4,46,M
3,5,45,M


In [29]:
# Write the final submission file (without the index column)
submission_path = "/Users/liu/Desktop/ESCP Python/python final/submission.csv"
sub_all.to_csv(submission_path, index=False)

In [30]:
# Distribution of the predicted sex over the whole submission
sub_all.loc[:, 'sex_pred'].value_counts()

M    2890665
F     221120
Name: sex_pred, dtype: int64

In [31]:
# (Interactive `??` source lookup of precision_recall_curve removed: it is IPython-only
# syntax and produces nothing reproducible; consult the scikit-learn API reference instead.)