In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from matplotlib.colors import ListedColormap
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Returns the feature names of a CSV file (without 'id' and without 'match').
def get_features_names(fileName):
    """Return the feature column names found in the header of a CSV file.

    Only the header line is needed, so it is read directly: a file holding a
    header but no data rows works, and an empty file yields an empty array.
    A leading 'id' column and a trailing 'match' column are dropped.

    Parameters
    ----------
    fileName : str
        Path of the comma-separated file to inspect.

    Returns
    -------
    numpy.ndarray
        1-D array of str with the remaining column names, in file order.
    """
    # newline='' is what the csv module recommends for files handed to it.
    with open(fileName, 'r', newline='') as f:
        header = csv.DictReader(f, delimiter=',').fieldnames

    # fieldnames is None when the file is empty.
    names = list(header) if header else []

    if names and names[0] == 'id':
        names = names[1:]

    if names and names[-1] == 'match':
        names = names[:-1]

    return np.array(names, dtype=str)

In [3]:
# Disabled sanity check: switch the guard to True to print the feature names
# of both files and report every position where they differ.
if False:
    feat_train = get_features_names('../dataset/train.csv')
    feat_test = get_features_names('../dataset/test.csv')

    for names in (feat_train, feat_test):
        print(names)

    for c, train_name in enumerate(feat_train):
        if not train_name == feat_test[c]:
            print("NON")

In [4]:
def read_train(fileName, features_names):
    """Load the training CSV as a numeric feature matrix plus its targets.

    Each requested cell is converted as follows:

    * a cell that parses with ``float()`` is used as that number;
    * otherwise the text is lower-cased and stripped, and
      - in the 'gender', 'race' / 'race_o' (one shared vocabulary) and
        'field' columns it is replaced by the position of that label in
        order of first appearance (an empty cell in these columns is a
        label like any other);
      - in every other column it becomes the sentinel -1000.0, meant to be
        replaced later by an acceptable value (mean, median, ...).
    * a cell that is absent because the row is shorter than the header
      (``csv.DictReader`` yields ``None``) is treated as an empty cell
      instead of raising ``TypeError``.

    Parameters
    ----------
    fileName : str
        Path of the comma-separated training file; it must have a 'match'
        column.
    features_names : sequence of str
        Names of the columns to extract, in output column order.

    Returns
    -------
    tuple
        ``(data, target, features_names, genders, races, fields)`` where
        ``data`` is a float array with one row per CSV row, ``target`` holds
        the 'match' cells as read from the file (they are not converted to
        float here), ``features_names`` is the argument unchanged, and
        ``genders`` / ``races`` / ``fields`` are str arrays of the labels
        met, indexed by their numeric code.
    """
    missing_value = -1000.0

    genders, races, fields = [], [], []
    # 'race' and 'race_o' deliberately share one label list.
    vocabularies = {'gender': genders, 'race': races, 'race_o': races, 'field': fields}

    data = []
    targets = []

    with open(fileName, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            line = []
            for name in features_names:
                raw = row[name]
                try:
                    line.append(float(raw))
                    continue
                except (TypeError, ValueError):
                    # TypeError: raw is None (short row); ValueError: text.
                    pass

                label = '' if raw is None else str(raw).lower().strip()
                labels = vocabularies.get(name)
                if labels is None:
                    line.append(missing_value)
                else:
                    if label not in labels:
                        labels.append(label)
                    line.append(float(labels.index(label)))

            data.append(line)
            targets.append(row['match'])

    # Appending to an initially float array keeps the dtype the per-row
    # np.append calls used to produce.
    target = np.append(np.array([], float), targets)

    return (np.array(data, float), target, features_names,
            np.array(genders, dtype=str), np.array(races, dtype=str),
            np.array(fields, dtype=str))

In [5]:
def read_test(fileName, features_names, genders, races, fields):
    """Load the test CSV as a numeric feature matrix plus its row ids.

    Cells are converted with the same rules as the training loader:

    * a cell that parses with ``float()`` is used as that number;
    * otherwise the text is lower-cased and stripped, and
      - in the 'gender', 'race' / 'race_o' (one shared vocabulary) and
        'field' columns it is replaced by the position of that label in the
        matching vocabulary; a label that is not there yet is appended, so
        it receives the next free code;
      - in every other column it becomes the sentinel -1000.0, meant to be
        replaced later by an acceptable value (mean, median, ...).
    * a cell that is absent because the row is shorter than the header
      (``csv.DictReader`` yields ``None``) is treated as an empty cell
      instead of raising ``TypeError``.

    Parameters
    ----------
    fileName : str
        Path of the comma-separated test file; it must have an 'id' column.
    features_names : sequence of str
        Names of the columns to extract, in output column order.
    genders, races, fields : sequence of str
        Label vocabularies to start from, indexed by numeric code. They are
        copied; the caller's objects are not modified.

    Returns
    -------
    tuple
        ``(data, ids, features_names, genders, races, fields)`` where
        ``data`` is a float array with one row per CSV row, ``ids`` is an
        array of the 'id' cells, ``features_names`` is the argument
        unchanged, and the three vocabularies are str arrays possibly
        extended with labels first met in this file.
    """
    missing_value = -1000.0

    # Work on copies so labels discovered here never leak into the inputs.
    genders = list(genders)
    races = list(races)
    fields = list(fields)
    # 'race' and 'race_o' deliberately share one label list.
    vocabularies = {'gender': genders, 'race': races, 'race_o': races, 'field': fields}

    data = []
    ids = []

    with open(fileName, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            line = []
            for name in features_names:
                raw = row[name]
                try:
                    line.append(float(raw))
                    continue
                except (TypeError, ValueError):
                    # TypeError: raw is None (short row); ValueError: text.
                    pass

                label = '' if raw is None else str(raw).lower().strip()
                labels = vocabularies.get(name)
                if labels is None:
                    line.append(missing_value)
                else:
                    if label not in labels:
                        labels.append(label)
                    line.append(float(labels.index(label)))

            data.append(line)
            ids.append(row['id'])

    return (np.array(data, float), np.array(ids), features_names,
            np.array(genders, dtype=str), np.array(races, dtype=str),
            np.array(fields, dtype=str))

In [6]:
# Feature column names, taken from the training file.
features = get_features_names(fileName='../dataset/train.csv')

In [7]:
# Load both files. The label vocabularies returned for the training file are
# passed on to read_test.
X_all, y_all, features_names, genders, races, fields = read_train(
    '../dataset/train.csv', features)
X_final_test, Ids, features_names_t, genders_t, races_t, fields_t = read_test(
    '../dataset/test.csv', features, genders, races, fields)

print(genders)

# Disabled debug: print column 7 of every test row.
if False:
    for r in X_final_test:
        print(r[7])

# Disabled debug: report whether reading the test file changed a vocabulary.
if False:
    print("Gender equals" if np.array_equal(genders, genders_t) else "Gender not equals !!")
    print("Race equals" if np.array_equal(races, races_t) else "Race not equals !!")
    print("Field equals" if np.array_equal(fields, fields_t) else "Field not equals !!")

# Disabled debug: count str / non-str cells, then show the first sample.
if False:
    nb_float = 0
    nb_not_float = 0
    for r in X_all:
        for c in r:
            if type(c) is not str:
                nb_float += 1
            else:
                nb_not_float += 1

    print("nb float = %d" % nb_float)
    print("nb_not_float = %d" % nb_not_float)

    print(len(X_all[0]))
    print(len(features))
    print(X_all[0])
    print(y_all[0])
    print(Ids[0])
    

['female' 'male']


In [8]:
def tree_to_code(this_tree, feature_names):
    """Print a fitted tree as nested, Python-like if/else pseudo-code.

    Parameters
    ----------
    this_tree : object
        Fitted estimator exposing a ``tree_`` attribute with ``feature``,
        ``threshold``, ``children_left``, ``children_right`` and ``value``.
    feature_names : sequence of str
        Name of each feature, indexed by the feature ids stored in the tree.

    Returns
    -------
    None
        The pseudo-code is written to standard output.
    """
    from sklearn.tree import _tree

    fitted = this_tree.tree_

    # Name of the feature each node splits on; leaves get a placeholder.
    node_labels = []
    for feature_id in fitted.feature:
        if feature_id != _tree.TREE_UNDEFINED:
            node_labels.append(feature_names[feature_id])
        else:
            node_labels.append("undefined!")

    print("def tree({}):".format(", ".join(feature_names)))

    def emit(node_id, level):
        pad = "  " * level

        # Leaf: there is no split, so print the stored value and stop.
        if fitted.feature[node_id] == _tree.TREE_UNDEFINED:
            print("{}return {}".format(pad, fitted.value[node_id]))
            return

        label = node_labels[node_id]
        cut = fitted.threshold[node_id]
        print("{}if {} <= {}:".format(pad, label, cut))
        emit(fitted.children_left[node_id], level + 1)
        print("{}else:  # if {} > {}".format(pad, label, cut))
        emit(fitted.children_right[node_id], level + 1)

    emit(0, 1)

In [9]:
def get_features_min_max(tab, features):
    """Compute the per-column minimum and maximum, ignoring missing cells.

    A cell counts as observed only when it is greater than -999, which
    excludes the -1000 missing-value sentinel. A column without any observed
    cell (or an empty table) gets NaN for both bounds; bounds are never
    carried over from the previously scanned column.

    Parameters
    ----------
    tab : array-like of float
        Table with one row per sample and at least ``len(features)`` columns.
    features : sequence
        One entry per column to scan; returned untouched as the first item.

    Returns
    -------
    list
        ``[features, mini, maxi]`` where ``mini`` and ``maxi`` are lists
        holding one bound per scanned column.
    """
    values = np.asarray(tab, dtype=float)

    mini = []
    maxi = []
    for c in range(len(features)):
        # An empty table has no column to slice; treat it as "nothing seen".
        column = values[:, c] if values.size else values.ravel()
        observed = column[column > -999]

        if observed.size:
            mini.append(observed.min())
            maxi.append(observed.max())
        else:
            mini.append(np.nan)
            maxi.append(np.nan)

    return [features, mini, maxi]

In [10]:
# Per-feature bounds measured on the training matrix.
min_max = get_features_min_max(tab=X_all, features=features)

# Disabled debug: dump the three lists, then one "name : min/max" line each.
if False:
    print(min_max[0])
    print(min_max[1])
    print(min_max[2])
    for c, name in enumerate(min_max[0]):
        print("%s : %d/%d" % (name, min_max[1][c], min_max[2][c]))

In [11]:
def fill_empty(tab, min_max):
    """Replace missing cells, in place, by the midpoint of the column range.

    A cell is considered missing when it is less than or equal to -999.

    Parameters
    ----------
    tab : mutable 2-D table of float
        Modified in place.
    min_max : sequence
        ``[names, minima, maxima]``; ``len(min_max[0])`` gives the number of
        columns to scan.

    Returns
    -------
    None
    """
    for record in tab:
        for j in range(len(min_max[0])):
            if record[j] <= float(-999):
                low = min_max[1][j]
                high = min_max[2][j]
                record[j] = (low + high) / 2

In [12]:
# Fill the missing cells of both matrices in place, using the same bounds.
fill_empty(tab=X_all, min_max=min_max)
fill_empty(tab=X_final_test, min_max=min_max)

# Disabled debug: show the first ten rows of each matrix.
if False:
    r = 0
    while r < 10:
        print(X_all[r])
        r += 1

    print("----------------------")

    r = 0
    while r < 10:
        print(X_final_test[r])
        r += 1


In [None]:
# Add new features built by combining existing ones (not implemented yet).
def add_features(data_train, data_test, features):
    """Placeholder for the planned feature-engineering step.

    The function was left unfinished: its only statement referenced an
    undefined name, so any call failed with ``NameError``. It now fails
    explicitly instead.

    Parameters
    ----------
    data_train, data_test
        Training and test feature tables (currently unused).
    features
        Feature names (currently unused).

    Raises
    ------
    NotImplementedError
        Always, until the features listed below are implemented.
    """
    # Planned features:
    # 01 diff_age                   (positive if gender = female(0), negative otherwise
    #                                (men prefer younger women))
    # 02 same_race_factor           (0 if same race, importance_same_race otherwise)
    # 03 attractive_factor          (the closer the attractiveness ratings the better
    #                                (large difference = low chance of a match))
    # 04 sincere_factor             (the other person's sincerity multiplied by sincere_important)
    # 05 intelligence_factor        (same idea)
    # 06 funny_factor               (same idea)
    # 07 ambition_factor            (same idea)
    # 08 shared_interests_factor    (same idea)
    # 09 attractive_factor_o        (the closer the attractiveness ratings the better
    #                                (large difference = low chance of a match))
    # 10 sincere_factor_o           (the other person's sincerity multiplied by sincere_important)
    # 11 intelligence_factor_o      (same idea)
    # 12 funny_factor_o             (same idea)
    # 13 ambition_factor_o          (same idea)
    # 14 shared_interests_factor_o  (same idea)
    #
    # 15 interests_correlate_factor (rescaled to 0..10 so that it weighs as much
    #                                as the other factors)
    raise NotImplementedError("add_features is not implemented yet")
    
        
    

In [44]:
# Reduce both matrices to a chosen subset of feature columns.
def filter_features(data_train, data_test, features, columns=(58, 60)):
    """Keep only the selected feature columns of the train and test tables.

    NOTE(review): the original header comment spoke of removing noisy
    features (it named importance_same_religion), but the code has always
    done the opposite selection: it keeps nothing except the listed columns.
    Which features sit at indices 58 and 60 depends on the CSV header --
    confirm that they are the intended ones.

    Parameters
    ----------
    data_train, data_test : array-like, 2-D
        Feature tables with one row per sample.
    features : sequence of str
        Feature names, indexed like the table columns.
    columns : sequence of int, optional
        Indices of the columns to keep, in output order. Defaults to
        ``(58, 60)``, the indices that used to be hard-coded.

    Returns
    -------
    tuple
        ``(tr, te, f)``: the reduced training table, the reduced test table
        (both new float arrays) and the list of the kept feature names.
    """
    selected = list(columns)

    tr = np.asarray(np.take(data_train, selected, axis=1), dtype=float)
    te = np.asarray(np.take(data_test, selected, axis=1), dtype=float)
    f = list(np.take(features, selected, axis=0))

    return tr, te, f

# Reduced train / test matrices and the names of the columns they hold.
tr, te, f = filter_features(data_train=X_all, data_test=X_final_test, features=features)

# Disabled debug: dump the three results.
if False:
    print(tr)
    print(te)
    print(f)


In [46]:
# Split the labelled data in two parts (80% for training/validation, 20% for
# test), stratified on the target. The split is made on the filtered matrix
# `tr`; pass X_all in its place to split on every feature instead.
X_train, X_test, y_train, y_test = train_test_split(
    tr,
    y_all,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y_all,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")


(6382, 2)
(6382,)
(1596, 2)
(1596,)


In [47]:
# Fit a decision tree on the training part of the split.
# fit() is called with the data only: sample_weight=None and check_input=True
# are its defaults, and X_idx_sorted no longer exists in current scikit-learn
# releases (passing it raises TypeError).
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Disabled debug: print the tree as pseudo-code. The model is trained on the
# filtered matrix, so the matching name list is `f`, not the full name list.
if False:
    tree_to_code(clf, f)

In [48]:
# Disabled debug: dump the held-out features and targets.
if False:
    print(X_test, y_test, sep="\n")

In [49]:
# Predict the held-out part, then show the classifier's score on it
# (left as the last expression so that the notebook displays it).
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)

0.85150375939849621

In [50]:
# The final test matrix and the id list must have the same number of rows.
print(len(X_final_test), len(Ids), sep="\n")

400
400


In [51]:
# For the final prediction the model is refitted on all the labelled data.
# X_idx_sorted is no longer passed: current scikit-learn releases reject it,
# and the other two keyword arguments were only restating the defaults.
# NOTE(review): this fits on X_all / X_final_test (every feature column),
# whereas the score above was measured on a tree trained on the filtered
# matrix `tr`; use tr / te here if the submission should come from the
# validated feature set -- confirm which one is intended.
clf.fit(X_all, y_all)
prediction = clf.predict(X_final_test)

In [52]:
# One prediction is expected per test id.
print(len(prediction), len(Ids), sep="\n")

400
400


In [53]:
# Write the submission file: a header line, then one "id,match" line per
# prediction, in prediction order.
fieldnames = ['id', 'match']
with open('Sueur_Fuchs_Pont_5.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for r, predicted in enumerate(prediction):
        writer.writerow({'id': Ids[r], 'match': predicted})