In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from matplotlib.colors import ListedColormap
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# renvoi un vecteur qui contient les noms des features (mais sans 'id' et sans 'match')
def get_features_names(fileName):
    features_names = np.array([],str)    
    with open(fileName, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        row = next(reader)

        for col in row:     
            features_names = np.append(features_names,col)
            
        if features_names[0] == 'id':
            features_names = features_names[1:]
            
        if features_names[len(features_names)-1] == 'match':
            features_names = features_names[:-1]
            
    return features_names             

In [3]:
if False:   
    feat_train = get_features_names('../dataset/train.csv')
    feat_test = get_features_names('../dataset/test.csv')
    
    print(feat_train)
    print(feat_test)
    
    for c in range(len(feat_train)):
        if feat_train[c] != feat_test[c]:
            print("NON")

In [4]:
def read_train(fileName, features_names):
    data = []
    target = np.array([],float)
    genders = np.array([],str)
    races = np.array([],str)
    fields = np.array([],str)
    
    with open(fileName, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            line = np.array([],float)
            for col in range(len(features_names)):
                try:
                    val = float(row[features_names[col]])
                    line = np.append(line,val)
                except ValueError:
                    val = str(row[features_names[col]])
                    val = val.lower() # tout en minuscule
                    val = val.strip() # on enlève les espaces inutiles

                    if  features_names[col] == 'gender':
                        if val not in genders:
                            genders = np.append(genders,val)

                        i, = np.where(genders == val)
                        val = float(i[0]) # classification; on remplace la valeur string du genre par 0 ou 1

                    if  features_names[col] == 'race' or features_names[col] == 'race_o':
                        if val not in races:
                            races = np.append(races,val)

                        i, = np.where(races == val)
                        val = float(i[0]) # classification; on remplace la valeur string du genre par un nombre

                    if  features_names[col] == 'field':
                        if val not in fields:
                            fields = np.append(fields,val)

                        i, = np.where(fields == val)
                        val = float(i[0]) # classification; on remplace la valeur string du genre par un nombre

                    # si le champ est vide alors on met -1000 (valeur assurée de ne pas exister normalement)
                    if type(val) is str:
                        val = float(-1000.0)  # on remplacera plus tard par une valeur acceptable (moyenne, medianne, etc ..)

                    line = np.append(line,val)

            data.append(line)
            target = np.append(target,row['match'])
                
    return np.array(data, float), target, features_names, genders, races, fields              

In [5]:
def read_test(fileName, features_names, genders, races, fields):
    data = []
    ids = []
    
    with open(fileName, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            line = np.array([],float)
            for col in range(len(features_names)):
                try:
                    val = float(row[features_names[col]])
                    line = np.append(line,val)
                except ValueError:
                    val = str(row[features_names[col]])
                    val = val.lower() # tout en minuscule
                    val = val.strip() # on enlève les espaces inutiles

                    if  features_names[col] == 'gender':
                        if val not in genders:
                            genders = np.append(genders,val)

                        i, = np.where(genders == val)
                        val = float(i[0]) # classification; on remplace la valeur string du genre par 0 ou 1

                    if  features_names[col] == 'race' or features_names[col] == 'race_o':
                        if val not in races:
                            races = np.append(races,val)

                        i, = np.where(races == val)
                        val = float(i[0]) # classification; on remplace la valeur string du genre par un nombre

                    if  features_names[col] == 'field':
                        if val not in fields:
                            fields = np.append(fields,val)

                        i, = np.where(fields == val)
                        val = float(i[0]) # classification; on remplace la valeur string du genre par un nombre

                    # si le champ est vide alors on met -1000 (valeur assurée de ne pas exister normalement)
                    if type(val) is str:
                        val = float(-1000.0)  # on remplacera plus tard par une valeur acceptable (moyenne, medianne, etc ..)

                    line = np.append(line,val)

            data.append(line)
            ids.append(row['id'])
                
    return np.array(data, float), np.array(ids), features_names, genders, races, fields

In [26]:
features = get_features_names('../dataset/train.csv')
print(features)

['gender' 'age' 'age_o' 'race' 'race_o' 'importance_same_race'
 'importance_same_religion' 'field' 'attractive_important'
 'sincere_important' 'intellicence_important' 'funny_important'
 'ambtition_important' 'shared_interests_important' 'attractive' 'sincere'
 'intelligence' 'funny' 'ambition' 'attractive_partner' 'sincere_partner'
 'intelligence_partner' 'funny_partner' 'ambition_partner'
 'shared_interests_partner' 'pref_o_attractive' 'pref_o_sincere'
 'pref_o_intelligence' 'pref_o_funny' 'pref_o_ambitious'
 'pref_o_shared_interests' 'attractive_o' 'sinsere_o' 'intelligence_o'
 'funny_o' 'ambitous_o' 'shared_interests_o' 'sports' 'tvsports'
 'exercise' 'dining' 'museums' 'art' 'hiking' 'gaming' 'clubbing'
 'reading' 'tv' 'theater' 'movies' 'concerts' 'music' 'shopping' 'yoga'
 'interests_correlate' 'expected_happy_with_sd_people'
 'expected_num_interested_in_me' 'expected_num_matches' 'like'
 'guess_prob_liked' 'met']


In [25]:
X_all, y_all, features_names, genders, races, fields = read_train('../dataset/train.csv',features)
X_final_test, Ids, features_names_t, genders_t, races_t, fields_t = read_test('../dataset/test.csv',features,genders, races, fields)

print(genders)

if False:
    for r in X_final_test:
        print(r[7])
    
if False:
    if np.array_equal(genders, genders_t) is True:
        print("Gender equals")
    else:
        print("Gender not equals !!")
        
    if np.array_equal(races, races_t) is True:
        print("Race equals")
    else:
        print("Race not equals !!")
        
    if np.array_equal(fields, fields_t) is True:
        print("Field equals")
    else:
        print("Field not equals !!")
    
if False:
    nb_float = 0
    nb_not_float = 0
    for r in X_all:
        for c in r:
            if type(c) is str:
                nb_not_float += 1
            else:
                nb_float += 1


    print("nb float = %d" % nb_float)
    print("nb_not_float = %d" % nb_not_float)

    print(len(X_all[0]))
    print(len(features))
    print(X_all[0])
    print(y_all[0])
    print(Ids[0])
    

['female' 'male']


In [8]:
def tree_to_code(this_tree, feature_names):
    from sklearn.tree import _tree
    tree = this_tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree.feature
    ]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, tree.value[node]))

    recurse(0, 1)

In [9]:
def get_features_min_max(tab, features):
    res = []
    mini = []
    maxi = []
    for c in range(len(features)):
        first = True
        for r in range(len(tab)):
            if tab[r][c] > -999:
                if first is True:
                    first = False
                    ma = tab[r][c]
                    mi = tab[r][c]
                    
                ma = max(ma,tab[r][c])
                mi = min(mi,tab[r][c])
                
        mini.append(mi)
        maxi.append(ma)
    
    res.append(features)
    res.append(mini)
    res.append(maxi)
    
    return res       

In [10]:
min_max = get_features_min_max(X_all,features)

if False:
    print(min_max[0])
    print(min_max[1])
    print(min_max[2])
    for c in range(len(min_max[0])):
        print("%s : %d/%d" % (min_max[0][c],min_max[1][c],min_max[2][c]))

In [11]:
def fill_empty(tab, min_max):
    for row in range(len(tab)):
        for col in range(len(min_max[0])):
            if tab[row][col] <= float(-999):
                tab[row][col] = (min_max[1][col] + min_max[2][col]) / 2

In [45]:
fill_empty(X_all,min_max)
fill_empty(X_final_test,min_max)

if False:
    for r in range(10):
        print(X_all[r])
        
    print("----------------------")
    
    for r in range(10):
        print(X_final_test[r])


In [13]:
# on ajoute des features par combinaison entre elles
def add_features(data_train, data_test, features):
    # 01 diff_age                  (if gender = female(0) alors positif, sinon négatif (hommes préfèrent femme moins agées))
    # 02 same_race_factor          (if meme race alors 0 sinon importance_same_race)
    # 03 attractive_factor         (plus les attractives rate sont proche mieux c'est (grosse différence = peu de chances de match))
    # 04 sincere_factor            (on multiplie le facteur sincère de l'autre par le sincere_important)
    # 05 intellicence_factor       (idem)
    # 06 funny_factor              (idem)
    # 07 ambtition_factor          (idem)
    # 08 shared_interests_factor   (idem)
    # 09 attractive_factor_o       (plus les attractives rate sont proche mieux c'est (grosse différence = peu de chances de match))
    # 10 sincere_factor_o          (on multiplie le facteur sincère de l'autre par le sincere_important)
    # 11 intellicence_factor_o     (idem)
    # 12 funny_factor_o            (idem)
    # 13 ambtition_factor_o        (idem)
    # 14 shared_interests_factor_o (idem)
    
    # 15 interests_correlate_factor (re-scaling de 0 a 10 pour qu'il aie la même importance que les autres facteurs)
    
    
    new_col = np.zeros(len(data[0]))
    
        
    

In [246]:
# on enlève les features inutiles qui ne font que rajouter du bruit
def filter_features(data_train, data_test, features):
    
    # 01 importance_same_religion     pas de feature religion donc c'est du bruit et on enlève
    # LIKE
    train = np.take(data_train, 58, 1)
    test = np.take(data_test, 58, 1)
    f = np.take(features, 58, 0)
    
    # MET
    train2 = np.take(data_train, 60, 1)
    test2 = np.take(data_test, 60, 1)
    f2 = np.take(features, 60, 0)
    
    # ATTRACTIVE
    fnum3_1 = 9
    fnum3_2 = 20
    train3_1 = np.take(data_train, fnum3_1, 1)
    test3_1 = np.take(data_test, fnum3_1, 1)
    train3_2 = np.take(data_train, fnum3_2, 1)
    test3_2 = np.take(data_test, fnum3_2, 1)
    f3 = features[fnum3_1]+" with "+features[fnum3_2]
    train3 = train3_1*train3_2
    test3 = test3_1*test3_2
    
    # AGE
    fnum4_1 = 1
    fnum4_2 = 2
    train4_1 = np.take(data_train, fnum4_1, 1)
    test4_1 = np.take(data_test, fnum4_1, 1)
    train4_2 = np.take(data_train, fnum4_2, 1)
    test4_2 = np.take(data_test, fnum4_2, 1)
    f4 = features[fnum4_1]+" with "+features[fnum4_2]
    train4 = train4_1-train4_2
    test4 = test4_1-test4_2
    
    train7 = np.take(data_train, fnum4_1, 1)
    test7 = np.take(data_test, fnum4_1, 1)
    f7 = np.take(features, fnum4_1, 0)
    
    train8 = np.take(data_train, fnum4_2, 1)
    test8 = np.take(data_test, fnum4_2, 1)
    f8 = np.take(features, fnum4_2, 0)
    
    # RACE
    fnum5_1 = 3
    fnum5_2 = 4
    fnum5_3 = 5
    train5_1 = np.take(data_train, fnum5_1, 1)
    test5_1 = np.take(data_test, fnum5_1, 1)
    train5_2 = np.take(data_train, fnum5_2, 1)
    test5_2 = np.take(data_test, fnum5_2, 1)
    train5_3 = np.take(data_train, fnum5_3, 1)
    test5_3 = np.take(data_test, fnum5_3, 1)
    train5 = np.logical_xor(train5_1, train5_2)*train5_3*10
    test5 = np.logical_xor(test5_1, test5_2)*test5_3*10
    f5 = features[fnum5_1]+" with "+features[fnum5_2]+" with "+features[fnum5_3]
    
    # INTELLIGENCE
    fnum6_1 = 10
    fnum6_2 = 21
    train6_1 = np.take(data_train, fnum6_1, 1)
    test6_1 = np.take(data_test, fnum6_1, 1)
    train6_2 = np.take(data_train, fnum6_2, 1)
    test6_2 = np.take(data_test, fnum6_2, 1)
    f6 = features[fnum6_1]+" with "+features[fnum6_2]
    train6 = train6_1*train6_2
    test6 = test6_1*test6_2
    
    # INTEREST
    fnum9_1 = 13
    fnum9_2 = 24
    train9_1 = np.take(data_train, fnum9_1, 1)
    test9_1 = np.take(data_test, fnum9_1, 1)
    train9_2 = np.take(data_train, fnum9_2, 1)
    test9_2 = np.take(data_test, fnum9_2, 1)
    f9 = features[fnum9_1]+" with "+features[fnum9_2]
    train9 = train9_1*train9_2
    test9 = test9_1*test9_2
    
    train10 = np.take(data_train, fnum9_1, 1)
    test10 = np.take(data_test, fnum9_1, 1)
    f10 = np.take(features, fnum9_1, 0)
    
    train11 = np.take(data_train, fnum9_2, 1)
    test11 = np.take(data_test, fnum9_2, 1)
    f11 = np.take(features, fnum9_2, 0)
    
    tr = np.zeros((len(train),20))
    te = np.zeros((len(test),20))
    
    for r in range(len(train)):
        tr[r][0] = train[r] # like
        tr[r][1] = train2[r] # met
        #tr[r][2] = train3[r] # attractive_combine
        tr[r][3] = train4[r] # age_combin
        tr[r][4] = train5[r] # race_combin
        #tr[r][5] = train6[r] # intelligence_combin
        tr[r][6] = train7[r] # age
        tr[r][7] = train8[r] # age_o
        tr[r][8] = train9[r] # interest_combin
        tr[r][9] = train10[r] # interest_imp
        tr[r][10] = train11[r] # interest_o
        
        
    for r in range(len(test)):
        te[r][0] = test[r] # like
        te[r][1] = test2[r] # met
        #te[r][2] = test3[r] # attractive_combine
        te[r][3] = test4[r] # age_combin
        te[r][4] = test5[r] # race_combin
        #te[r][5] = test6[r] # intelligence_combin
        te[r][6] = test7[r] # age
        te[r][7] = test8[r] # age_o
        te[r][8] = test9[r] # interest_combin
        te[r][9] = test10[r] # interest_imp
        te[r][10] = test11[r] # interest_o
        
    
    f = [f, f2, f4, f5, f7, f8, f9, f10, f11]
    
    return tr, te, f

tr,te,f = filter_features(X_all, X_final_test, features)

if False:
    print(tr)
    print(te)
    print(f)


In [247]:
# Split datas in 2 parts (80% for trainning/validation and 20% for test)
#X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True, stratify=y_all)
X_train, X_test, y_train, y_test = train_test_split(tr, y_all, test_size=0.2, random_state=42, shuffle=True, stratify=y_all)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

from collections import Counter
z = [0, 1]
Counter(y_test)


(1596,)


Counter({'0': 1360, '1': 236})

In [248]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train, sample_weight=None, check_input=True, X_idx_sorted=None)

if False:
    tree_to_code(clf, features_names)

In [249]:
if False:
    print(X_test)
    print(y_test)

In [250]:
y_pred = clf.predict(X_test)
clf.score(X_test, y_test, sample_weight=None)    

0.7807017543859649

In [251]:
print(len(X_final_test))
print(len(Ids))

400
400


In [252]:
# pour la prediction fianle on entraine avec toutes les données
clf.fit(X_all, y_all, sample_weight=None, check_input=True, X_idx_sorted=None)
prediction = clf.predict(X_final_test)

In [253]:
print(len(prediction))
print(len(Ids))

400
400


In [254]:
with open('Sueur_Fuchs_Pont_06.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'match']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for r in range(len(prediction)):
        writer.writerow({'id': Ids[r], 'match': prediction[r]})