# I. Setting up the Problem

In [1]:
import pandas as pd
import numpy as np

# Import the random forest package
from sklearn.ensemble import RandomForestClassifier 

In [2]:
filename ="CrowdstormingDataJuly1st.csv"
Data = pd.read_csv(filename)

### 1) Peeking into the Data

In [3]:
Data.ix[:10,:13]

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,1,0
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,1,0
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,1,0,0
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,0,0,0
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,0,0,0
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,1,0
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,0,0
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,1,0
8,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193.0,80.0,Goalkeeper,1,0,1,0,0
9,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,Right Fullback,1,1,0,0,0


In [4]:
#Data.ix[:10,13:28]

# II. Preparing the training & test data : Unique Game Row version

### 1) Keep only players that have a Rater Image

In [4]:
# 1) Remove the players without rater 1 / 2 rating because we won't be 
# able to train or test the values (this can be done as bonus later)

Data_hasImage = Data[pd.notnull(Data['photoID'])]
#Data_hasImage.ix[:10,13:28]

### 2) Disaggregate the data so each row is 1 game

Got a lot of help from this script ! https://osf.io/w7tds/
It will be much simpler for us to train our random forest if each row corresponds to one game. This way, we won't have to give a different "weight" to each row according to the number of played games.

But let's start by doing the mean value of rater1 and rater 2, because if we keep them separated we might get some strange results.
Indeed, what if for a player, rater1 = 0.0 and rater2 = 0.75 ?
It would not make a lot of sense, or at least we would know our model is not viable !

In [5]:
Data_hasImage['mean_rater']=(Data_hasImage['rater1']+Data_hasImage['rater2'])/2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Let's now disaggregate the games:

In [6]:
game_counter = 0
game_total_number = sum(Data_hasImage['games'])
# Raw table that we'll have to convert to a dataframe later
output = [0 for i in range(game_total_number)]

# We now iterate each row of our dataframe, which may contains more that one game
for i, row in Data_hasImage.iterrows():
    # Number of games in the current row
    row_game_number = row['games']
    # Number of cumulated cards for the games in the current row
    yellowCards = row['yellowCards']
    yellowReds = row['yellowReds']
    redCards = row['redCards']
    # We want to seperate each of these games    
    for j in range (row_game_number):
        game = row
        game['yellowCards'] = 0
        game['yellowReds'] = 0
        game['redCards'] = 0
        # Basically, we distribute the cards we have on separate games.
        # ie: if we have 2 yellowCard and 1 redCard for a total of 4 games,
        # the first two games will be assigned a yellowCard,
        # the third game will be assigned a redCard,
        # and the last game won't have any card assigned, because there is no card left.        
        if yellowCards > 0:
            game['yellowCards'] = 1
            yellowCards = yellowCards - 1
        elif yellowReds > 0:
            game['yellowReds'] = 1
            yellowReds = yellowReds - 1
        elif redCards > 0:
            game['redCards'] = 1
            redCards = redCards - 1
            
        # Convert from pandas Series to prevent overwriting previous values of the output
        gamelist=list(game)
        # Add the new game to the output
        output[game_counter] = gamelist
        game_counter = game_counter + 1

# Here is the output dataframe

Data_OneGamePerRow = pd.DataFrame(output, columns=list(Data_hasImage.columns))
Data_OneGamePerRow

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,mean_rater
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,1,1,GRC,0.326391,712.0,0.000564,0.396000,750.0,0.002696,0.375
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,0.750
2,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
3,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
4,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,1.000
5,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193.0,80.0,Goalkeeper,1,0,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.250
6,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,Right Fullback,1,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.000
7,antonio-rukavina,Antonio Rukavina,Real Valladolid,Spain,26.01.1984,177.0,74.0,Right Fullback,2,2,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.000
8,antonio-rukavina,Antonio Rukavina,Real Valladolid,Spain,26.01.1984,177.0,74.0,Right Fullback,2,2,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.000
9,ashkan-dejagah,Ashkan Dejagah,Fulham FC,England,05.07.1986,181.0,74.0,Left Winger,1,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.500


### 3) Create the Training and Testing Datframes with only select data

In [70]:
# Removing columns that we do not need
Data_Simple1 = Data_OneGamePerRow[['playerShort', 'yellowCards', 'yellowReds', 'redCards',
                              'refNum', 'refCountry', 'mean_rater', 'games']]

# Take a random 80% sample of the Data for the Training Sample
#Data_Training = Data_Simple1.sample(frac=0.8)

# Take a random 20% sample of the Data for the Testing Sample
#Data_Testing = Data_Simple1.loc[~Data_Simple1.index.isin(Data_Training.index)]

In [71]:
Data_Simple1

Unnamed: 0,playerShort,yellowCards,yellowReds,redCards,refNum,refCountry,mean_rater,games
0,lucas-wilchez,0,0,0,1,1,0.375,1
1,john-utaka,1,0,0,2,2,0.750,1
2,aaron-hughes,0,0,0,4,4,0.125,1
3,aleksandar-kolarov,0,0,0,4,4,0.125,1
4,alexander-tettey,0,0,0,4,4,1.000,1
5,anders-lindegaard,0,0,0,4,4,0.250,1
6,andreas-beck,0,0,0,4,4,0.000,1
7,antonio-rukavina,1,0,0,4,4,0.000,2
8,antonio-rukavina,0,0,0,4,4,0.000,2
9,ashkan-dejagah,0,0,0,4,4,0.500,1


In [73]:
#find proportion of yellow & red cards to games
Data_Simple1['fractionYellow'] = Data_Simple1['yellowCards']/Data_Simple1['games']
Data_Simple1['fractionYellowRed'] = Data_Simple1['yellowReds']/Data_Simple1['games']
Data_Simple1['fractionRed'] = Data_Simple1['redCards']/Data_Simple1['games']
Data_Simple1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,playerShort,yellowCards,yellowReds,redCards,refNum,refCountry,mean_rater,games,fractionYellow,fractionYellowRed,fractionRed
0,lucas-wilchez,0,0,0,1,1,0.375,1,0.0,0.0,0.0
1,john-utaka,1,0,0,2,2,0.750,1,1.0,0.0,0.0
2,aaron-hughes,0,0,0,4,4,0.125,1,0.0,0.0,0.0
3,aleksandar-kolarov,0,0,0,4,4,0.125,1,0.0,0.0,0.0
4,alexander-tettey,0,0,0,4,4,1.000,1,0.0,0.0,0.0
5,anders-lindegaard,0,0,0,4,4,0.250,1,0.0,0.0,0.0
6,andreas-beck,0,0,0,4,4,0.000,1,0.0,0.0,0.0
7,antonio-rukavina,1,0,0,4,4,0.000,2,0.5,0.0,0.0
8,antonio-rukavina,0,0,0,4,4,0.000,2,0.0,0.0,0.0
9,ashkan-dejagah,0,0,0,4,4,0.500,1,0.0,0.0,0.0


In [74]:
colRate = ['mean_rater']
Col_Rating = Data_Simple1[colRate].values
Ratings_Scale = []; 
Col_Rating

array([[ 0.375],
       [ 0.75 ],
       [ 0.125],
       ..., 
       [ 0.25 ],
       [ 0.5  ],
       [ 0.125]])

In [75]:
# Must now convert this continuous scale into a categorical one, with 20 categories
A = len(Col_Rating)
for i in range (0,A):
    if Col_Rating[i] >= 0 and Col_Rating[i] <0.05:
        Ratings_Scale.append(1);
    elif Col_Rating[i] >= 0.05 and Col_Rating[i] <0.1:
        Ratings_Scale.append(2);
    elif Col_Rating[i] >= 0.1 and Col_Rating[i] <0.15:
        Ratings_Scale.append(3);
    elif Col_Rating[i] >= 0.15 and Col_Rating[i] <0.2:
        Ratings_Scale.append(4);
    elif Col_Rating[i] >= 0.2 and Col_Rating[i] <0.25:
        Ratings_Scale.append(5);
    elif Col_Rating[i] >= 0.25 and Col_Rating[i] <0.3:
        Ratings_Scale.append(6);
    elif Col_Rating[i] >= 0.3 and Col_Rating[i] <0.35:
        Ratings_Scale.append(7);
    elif Col_Rating[i] >= 0.35 and Col_Rating[i] <0.4:
        Ratings_Scale.append(8);
    elif Col_Rating[i] >= 0.4 and Col_Rating[i] <0.45:
        Ratings_Scale.append(9);
    elif Col_Rating[i] >= 0.45 and Col_Rating[i] <0.5:
        Ratings_Scale.append(10);
    elif Col_Rating[i] >= 0.5 and Col_Rating[i] <0.55:
        Ratings_Scale.append(11);
    elif Col_Rating[i] >= 0.55 and Col_Rating[i] <0.6:
        Ratings_Scale.append(12);
    elif Col_Rating[i] >= 0.6 and Col_Rating[i] <0.65:
        Ratings_Scale.append(13);
    elif Col_Rating[i] >= 0.65 and Col_Rating[i] <0.7:
        Ratings_Scale.append(14);
    elif Col_Rating[i] >= 0.7 and Col_Rating[i] <0.75:
        Ratings_Scale.append(15);
    elif Col_Rating[i] >= 0.75 and Col_Rating[i] <0.8:
        Ratings_Scale.append(16);
    elif Col_Rating[i] >= 0.8 and Col_Rating[i] <0.85:
        Ratings_Scale.append(17);
    elif Col_Rating[i] >= 0.85 and Col_Rating[i] <0.9:
        Ratings_Scale.append(18);
    elif Col_Rating[i] >= 0.9 and Col_Rating[i] <0.95:
        Ratings_Scale.append(19);
    elif Col_Rating[i] >= 0.95 and Col_Rating[i] <=1:
        Ratings_Scale.append(20);
    else:
        Ratings_Scale.append(99);
        
Data_Simple1['raterScale'] = Ratings_Scale
Data_Simple1.head()

## Some of the values in trainRes_1 are larger than one! We must delete them from the simple data set to avoid errors in the training process.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,playerShort,yellowCards,yellowReds,redCards,refNum,refCountry,mean_rater,games,fractionYellow,fractionYellowRed,fractionRed,raterScale
0,lucas-wilchez,0,0,0,1,1,0.375,1,0.0,0.0,0.0,8
1,john-utaka,1,0,0,2,2,0.75,1,1.0,0.0,0.0,16
2,aaron-hughes,0,0,0,4,4,0.125,1,0.0,0.0,0.0,3
3,aleksandar-kolarov,0,0,0,4,4,0.125,1,0.0,0.0,0.0,3
4,alexander-tettey,0,0,0,4,4,1.0,1,0.0,0.0,0.0,20


In [76]:
# drop values on scale which are equal to 99
Data_Simple2 = Data_Simple1[Data_Simple1.raterScale != 99]
Data_Simple2.dropna(axis=0)
Data_Simple2

Unnamed: 0,playerShort,yellowCards,yellowReds,redCards,refNum,refCountry,mean_rater,games,fractionYellow,fractionYellowRed,fractionRed,raterScale
0,lucas-wilchez,0,0,0,1,1,0.375,1,0.0,0.0,0.0,8
1,john-utaka,1,0,0,2,2,0.750,1,1.0,0.0,0.0,16
2,aaron-hughes,0,0,0,4,4,0.125,1,0.0,0.0,0.0,3
3,aleksandar-kolarov,0,0,0,4,4,0.125,1,0.0,0.0,0.0,3
4,alexander-tettey,0,0,0,4,4,1.000,1,0.0,0.0,0.0,20
5,anders-lindegaard,0,0,0,4,4,0.250,1,0.0,0.0,0.0,6
6,andreas-beck,0,0,0,4,4,0.000,1,0.0,0.0,0.0,1
7,antonio-rukavina,1,0,0,4,4,0.000,2,0.5,0.0,0.0,1
8,antonio-rukavina,0,0,0,4,4,0.000,2,0.0,0.0,0.0,1
9,ashkan-dejagah,0,0,0,4,4,0.500,1,0.0,0.0,0.0,11


# II. Preparing the training & test data : Fraction version

### 1) Create the Training and Testing Datframes with only select data

In [77]:
#create test and training matrix

cols = ['games', 'fractionYellow', 'fractionYellowRed', 'fractionRed', 'refNum', 'refCountry']
exclude = ['raterScale','mean_rater', 'playerShort', 'yellowCards','yellowReds','redCards', 'games']
colsRes1 = ['raterScale']


# Take a random 80% sample of the Data for the Training Sample
Data_Training = Data_Simple2.sample(frac=0.8)

# Need to split this into the data and the results columns
# http://stackoverflow.com/questions/34246336/python-randomforest-unknown-label-error
Input_Data_Training = Data_Training.drop(exclude, axis=1)

#Results_Data_Training = list(Data_Training.raterAvg.values)
Results_Data_Training = Data_Training[colsRes1]
Input_Data_Training.head()

Unnamed: 0,refNum,refCountry,fractionYellow,fractionYellowRed,fractionRed
267796,2165,44,0.0,0.0,0.0
228922,1909,44,0.0,0.0,0.0
324658,2639,3,0.0,0.0,0.0
52219,409,7,0.0,0.0,0.0
259015,2080,44,0.058824,0.0,0.0


In [14]:
# Take a random 20% sample of the Data for the Testing Sample
#Data_Testing = Data_Simple1.loc[~Data_Simple1.index.isin(Data_Training.index)]

# Need to split this into the data and the results columns
# http://stackoverflow.com/questions/34246336/python-randomforest-unknown-label-error
#Input_Data_Testing = Data_Testing.drop(colsRes, axis=1)
#Results_Data_Testing = list(Data_Testing.raterAvg.values)

In [78]:
# Need to make arrays
# http://www.analyticbridge.com/profiles/blogs/random-forest-in-python
trainArr = Input_Data_Training.as_matrix() #training array
#trainRes = Results_Data_Training.as_matrix(colsRes) #training results
trainRes_1 = Data_Training['raterScale'].values
trainArr


array([[  2.16500000e+03,   4.40000000e+01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  1.90900000e+03,   4.40000000e+01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  2.63900000e+03,   3.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       ..., 
       [  7.70000000e+01,   4.40000000e+01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  3.04000000e+02,   3.00000000e+00,   7.14285714e-02,
          0.00000000e+00,   0.00000000e+00],
       [  1.64600000e+03,   8.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00]])

# III. Random Forest

In [80]:
#Initialize
forest = RandomForestClassifier(n_estimators = 100)

# Fit the training data and create the decision trees
forest = forest.fit(trainArr,trainRes_1)

# Take the same decision trees and run it on the test data
Data_Testing = Data_Simple2.sample(frac=0.2)
Input_Data_Testing = Data_Testing.drop(exclude, axis=1)
testArr = Input_Data_Testing.as_matrix()
results = forest.predict(testArr)

Data_Testing['predictions'] = results
Data_Testing.head()

Unnamed: 0,playerShort,yellowCards,yellowReds,redCards,refNum,refCountry,mean_rater,games,fractionYellow,fractionYellowRed,fractionRed,raterScale,predictions
73468,thomas-kahlenberg,1,0,0,494,8,0.0,3,0.333333,0.0,0.0,1,6
311018,wolfgang-hesl,0,0,0,2494,8,0.125,2,0.0,0.0,0.0,3,1
233995,stephan-hain,0,0,0,1937,8,0.125,4,0.0,0.0,0.0,3,1
163358,jesus-gamez,0,0,0,1349,3,0.375,7,0.0,0.0,0.0,8,6
36447,tom-trybull,0,0,0,298,8,0.125,1,0.0,0.0,0.0,3,1


In [84]:
#see percentage of right predictions
correct = list(Data_Testing[Data_Testing['raterScale'] == Data_Testing['predictions']].index)
A = len(correct)
percCorrect = A/Data_Testing['raterScale'].size
percCorrect

0.37304491174460214

The first attempt resulted in a 37% success of predicions with n_estimatos = 100. 

In [None]:
#See features importance
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
#Initialize
forest = RandomForestClassifier(n_estimators = 500)

# Fit the training data and create the decision trees
forest = forest.fit(trainArr,trainRes_1)

# Take the same decision trees and run it on the test data
Data_Testing = Data_Simple2.sample(frac=0.2)
Input_Data_Testing = Data_Testing.drop(exclude, axis=1)
testArr = Input_Data_Testing.as_matrix()
results2 = forest.predict(testArr)

Data_Testing['predictions2'] = results2
Data_Testing.head()