# Import Library and Dataset

In [1]:
import numpy
import sklearn
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
np.random.seed(123)
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [2]:
df=pd.read_csv("qudditch_training.csv")
df['quidditch_league_player'] = (preprocessing.LabelEncoder().fit_transform(df.quidditch_league_player.values))
target1=df['quidditch_league_player']
df=df.drop(['quidditch_league_player'],axis=1)


# PART I: Preprocessing

#### Handling missing values. (If ANY)

In [3]:
df['house']=df['house'].replace('?','Unknown')
df['move_specialty']=df['move_specialty'].replace('?','Unknown')
df['player_code']=df['player_code'].replace('?','Unknown')
df=df.drop(['weight'],axis=1)

#### Feature Datatype Conversion From Numeric to categoric and Vice-versa. (If ANY)

In [4]:
#label encoding
def encode(df):
    df.astype(str)
    df=df.apply(LabelEncoder().fit_transform)
    return df

def cleandata(df):
#one hot encoding
    columns_dum=[ 'player_type']
    df=pd.get_dummies(df,columns=columns_dum)
#minmax scaling
    scaler = MinMaxScaler()
    scaler.fit(df)
    scaled_df=scaler.transform(df)
    df=pd.DataFrame(scaled_df,columns=df.columns)
    return df

#### Feature Reduction or extraction. (If ANY)

In [7]:
def features(df):
    
    df=encode(df)
    
    #replacing tatctics No=0 and rest equal to 1
    tactics_column=['body_blow','checking','dopplebeater_defence','hawkshead_attacking_formation','no_hands_tackle','power_play','sloth_grip_roll','spiral_dive','starfish_and_stick','twirl','wronski_feint','zig-zag','bludger_backbeat','chelmondiston_charge','dionysus_dive','double_eight_loop','finbourgh_flick','reverse_pass','parkins_pincer','plumpton_pass','porskoff_ploy','transylvanian_tackle','woollongong_shimmy']
    for i in range(len(tactics_column)):
        
        (df[tactics_column[i]])=df[tactics_column[i]].apply(lambda x: 0 if x != 0 else 1)
  
    #creating new features
    df['total_num_gamenotpart']=df[['num_games_injured','num_games_notpartof','num_games_satout']].sum(axis=1)
    df['total_number_tactics0']=(df[['body_blow', 'checking', 'dopplebeater_defence',
           'hawkshead_attacking_formation', 'no_hands_tackle', 'power_play',
           'sloth_grip_roll', 'spiral_dive', 'starfish_and_stick', 'twirl',
           'wronski_feint', 'zig-zag', 'bludger_backbeat', 'chelmondiston_charge',
           'dionysus_dive', 'reverse_pass',
           'parkins_pincer', 'plumpton_pass', 'porskoff_ploy',
           'transylvanian_tackle', 'woollongong_shimmy']]!=0).sum(axis=1)
    df['total_number_tactics1']=(df[['body_blow', 'checking', 'dopplebeater_defence',
           'hawkshead_attacking_formation', 'no_hands_tackle', 'power_play',
           'sloth_grip_roll', 'spiral_dive', 'starfish_and_stick', 'twirl',
           'wronski_feint', 'zig-zag', 'bludger_backbeat', 'chelmondiston_charge',
           'dionysus_dive', 'reverse_pass',
           'parkins_pincer', 'plumpton_pass', 'porskoff_ploy',
           'transylvanian_tackle', 'woollongong_shimmy']]==0).sum(axis=1)
    #replacing values
    df['stooging']=df['stooging'].replace('>7','High')
    df['stooging']=df['stooging'].replace('>8','High')
    df['snitchnip']=df['snitchnip'].replace('>200','High')
    df['snitchnip']=df['snitchnip'].replace('>300','High')

    #dropping unwanted features
    df=df.drop(['num_games_injured','num_games_notpartof','num_games_satout','change','double_eight_loop',
                'finbourgh_flick','body_blow', 'checking', 'dopplebeater_defence',
           'hawkshead_attacking_formation', 'no_hands_tackle', 'power_play',
           'sloth_grip_roll', 'spiral_dive', 'starfish_and_stick', 'twirl',
           'wronski_feint', 'zig-zag', 'bludger_backbeat', 'chelmondiston_charge',
           'dionysus_dive', 'reverse_pass',
           'parkins_pincer', 'plumpton_pass', 'porskoff_ploy',
           'transylvanian_tackle', 'woollongong_shimmy'],axis=1)
    return df

#### Any other Pre-processing Used. (Give the name along with the code.)

# Feature Selection

In [8]:
#applying smote before feature selection
data=features(df)
dataset=data
col=df.columns
os = SMOTE(random_state=0)
df,target=os.fit_sample(df,target1)
df=pd.DataFrame(df,columns=col)
sel = SelectFromModel(RandomForestClassifier(n_estimators =150))
sel.fit(dataset,target1)
sel.get_support()
selected_feat= dataset.columns[(sel.get_support())]
len(selected_feat)
print(selected_feat)

Index(['id_num', 'player_id', 'age', 'game_duration', 'num_game_moves',
       'num_practice_sessions', 'player_type', 'total_num_gamenotpart'],
      dtype='object')


# Dataset modification

In [9]:
#dropping unwanted features
data=dataset.drop(selected_feat,axis=1)
cleandata_train=dataset.drop(data.columns,axis=1)
clean_data=cleandata(cleandata_train)
clean_data


Unnamed: 0,id_num,player_id,age,game_duration,num_game_moves,num_practice_sessions,total_num_gamenotpart,player_type_0,player_type_1,player_type_2,player_type_3,player_type_4,player_type_5,player_type_6,player_type_7,player_type_8
0,0.000000,0.144850,0.000000,0.000000,0.341880,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000010,0.534177,0.111111,0.153846,0.495726,0.229730,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.000020,0.734217,0.222222,0.076923,0.085470,0.162162,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.000030,0.696095,0.333333,0.076923,0.367521,0.202703,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000040,0.451057,0.444444,0.000000,0.427350,0.094595,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000049,0.697050,0.555556,0.153846,0.256410,0.202703,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.000059,0.705541,0.666667,0.230769,0.589744,0.270270,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0.000069,0.956770,0.777778,0.307692,0.615385,0.148649,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.000079,0.500590,0.888889,0.923077,0.572650,0.364865,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,0.000089,0.597198,1.000000,0.846154,0.273504,0.229730,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Data Split

In [10]:
x_train, x_test, y_train, y_test = np.array(train_test_split(clean_data,target1, train_size = 0.7, ))

# Smote(sampling data)

In [11]:
os = SMOTE(random_state=0)
#sme = SMOTEENN(random_state=0,verbose=1)
os_data_X,os_data_y=os.fit_sample(x_train,y_train)
os_data_X

array([[0.47672937, 0.23279903, 0.77777778, ..., 0.        , 0.        ,
        0.        ],
       [0.23116575, 0.59768969, 0.77777778, ..., 0.        , 0.        ,
        0.        ],
       [0.45907273, 0.73344468, 0.77777778, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.28603116, 0.67713878, 0.77777778, ..., 0.        , 0.        ,
        1.        ],
       [0.63351787, 0.48371902, 0.22222222, ..., 0.        , 0.        ,
        0.        ],
       [0.02646202, 0.08117609, 0.44444444, ..., 0.        , 0.        ,
        0.        ]])

# PART II: Classification

# Model 1:
Model Name:Logistic regression<br>
Evaluation method and metric used Name:classification report ,confusion matrix,Area under the curve score<br>
Name of the Hyperparameter used: penalty:l2,random_state,solver<br>


In [12]:
new= LogisticRegression(penalty='l2',random_state=3,solver='saga').fit(os_data_X,os_data_y)
pred_y_0 = new.predict(x_test)

In [13]:
print("Accuracy:",(accuracy_score(y_test,pred_y_0)*100))
print("classification_report:\n",classification_report(y_test,pred_y_0))
print("confusion matrix:\n",confusion_matrix(y_test,pred_y_0,))
print("\nArea under the curve:",roc_auc_score(y_test,pred_y_0))

Accuracy: 64.45358788676761
classification_report:
               precision    recall  f1-score   support

           0       0.91      0.66      0.77     27009
           1       0.16      0.51      0.24      3371

   micro avg       0.64      0.64      0.64     30380
   macro avg       0.54      0.58      0.50     30380
weighted avg       0.83      0.64      0.71     30380

confusion matrix:
 [[17876  9133]
 [ 1666  1705]]

Area under the curve: 0.5838190449475958
17876 9133 1666 1705


### Model 2:
Model Name:Random Forest classifier<br>
Evaluation method and metric used Name:classification report ,confusion matrix,Area under the curve score<br>
Name of the Hyperparameter used:(n_estimators=20,verbose=1,n_jobs=9,criterion='gini'<br>


In [14]:
clf_rf = RandomForestClassifier(n_estimators=20,verbose=1,n_jobs=9,criterion='gini')
clf_rf.fit(os_data_X,os_data_y)
new=clf_rf.predict(x_test)

[Parallel(n_jobs=9)]: Using backend ThreadingBackend with 9 concurrent workers.
[Parallel(n_jobs=9)]: Done  20 out of  20 | elapsed:    2.4s finished
[Parallel(n_jobs=9)]: Using backend ThreadingBackend with 9 concurrent workers.
[Parallel(n_jobs=9)]: Done  20 out of  20 | elapsed:    0.0s finished


In [15]:
print("Accuracy:",(accuracy_score(y_test,new)*100))
print("classification_report:\n",classification_report(y_test,new))
print("confusion matrix:\n",confusion_matrix(y_test,new))
print("\nArea under the curve:",roc_auc_score(y_test,new))

Accuracy: 87.63331138907175
classification_report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93     27009
           1       0.19      0.04      0.06      3371

   micro avg       0.88      0.88      0.88     30380
   macro avg       0.54      0.51      0.50     30380
weighted avg       0.81      0.88      0.84     30380

confusion matrix:
 [[26505   504]
 [ 3253   118]]

Area under the curve: 0.5081720016001785
17876 9133 1666 1705


### Model 3:
Model Name:Neural Network<br>
Evaluation method and metric used Name:classification report ,confusion matrix,Area under the curve score<br>
Name of the Hyperparameter used:solver='adam', alpha=1e-5,hidden_layer_sizes=(16,8,3),verbose=1, random_state=3,momentum=0.9,max_iter=300<br>


In [16]:
clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(16,8,3),verbose=1, random_state=3,momentum=0.9,max_iter=500)
clf.fit(os_data_X,os_data_y)
new=clf.predict(x_test)

Iteration 1, loss = 0.68741798
Iteration 2, loss = 0.66934869
Iteration 3, loss = 0.65825524
Iteration 4, loss = 0.65223543
Iteration 5, loss = 0.64862459
Iteration 6, loss = 0.64651140
Iteration 7, loss = 0.64374301
Iteration 8, loss = 0.64069116
Iteration 9, loss = 0.63800029
Iteration 10, loss = 0.63542771
Iteration 11, loss = 0.63301147
Iteration 12, loss = 0.63077697
Iteration 13, loss = 0.62923828
Iteration 14, loss = 0.62827669
Iteration 15, loss = 0.62685949
Iteration 16, loss = 0.62555033
Iteration 17, loss = 0.62208633
Iteration 18, loss = 0.61926283
Iteration 19, loss = 0.61567969
Iteration 20, loss = 0.61036294
Iteration 21, loss = 0.60626148
Iteration 22, loss = 0.60115082
Iteration 23, loss = 0.59740684
Iteration 24, loss = 0.59292966
Iteration 25, loss = 0.58983934
Iteration 26, loss = 0.58683379
Iteration 27, loss = 0.58445713
Iteration 28, loss = 0.58190478
Iteration 29, loss = 0.58018444
Iteration 30, loss = 0.57861778
Iteration 31, loss = 0.57687374
Iteration 32, los

In [17]:
print("Accuracy:",(accuracy_score(y_test,new)*100))
print("classification_report:\n",classification_report(y_test,new))
print("confusion matrix:\n",confusion_matrix(y_test,new))
print("\nArea under the curve:",roc_auc_score(y_test,new))

Accuracy: 73.46280447662936
classification_report:
               precision    recall  f1-score   support

           0       0.91      0.78      0.84     27009
           1       0.17      0.35      0.23      3371

   micro avg       0.73      0.73      0.73     30380
   macro avg       0.54      0.57      0.53     30380
weighted avg       0.82      0.73      0.77     30380

confusion matrix:
 [[21130  5879]
 [ 2183  1188]]

Area under the curve: 0.5673747477672028
17876 9133 1666 1705


# PART III: Best Hypothesis:
Model Name:Neural Network<br>
Reason:Best recall and f-1score<br>
Hyper-parameter Value:solver='adam', alpha=1e-5,hidden_layer_sizes=(16,8,3),verbose=1, random_state=3,momentum=0.9,max_iter=300<br>


In [18]:
os = SMOTE(random_state=0)
#sme = SMOTEENN(random_state=0,verbose=1)
os_data_X,os_data_y=os.fit_sample(clean_data,target1)
clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(16,8,3),verbose=1, random_state=3,momentum=0.9,max_iter=500)
clf.fit(os_data_X,os_data_y)

Iteration 1, loss = 0.68385114
Iteration 2, loss = 0.66477726
Iteration 3, loss = 0.65654959
Iteration 4, loss = 0.65149116
Iteration 5, loss = 0.64760503
Iteration 6, loss = 0.64317341
Iteration 7, loss = 0.63908233
Iteration 8, loss = 0.63539085
Iteration 9, loss = 0.63228860
Iteration 10, loss = 0.62888810
Iteration 11, loss = 0.62401528
Iteration 12, loss = 0.61799227
Iteration 13, loss = 0.61007677
Iteration 14, loss = 0.60273436
Iteration 15, loss = 0.59615102
Iteration 16, loss = 0.59176286
Iteration 17, loss = 0.58889278
Iteration 18, loss = 0.58585854
Iteration 19, loss = 0.58368921
Iteration 20, loss = 0.58217981
Iteration 21, loss = 0.58094340
Iteration 22, loss = 0.57925460
Iteration 23, loss = 0.57829698
Iteration 24, loss = 0.57740535
Iteration 25, loss = 0.57638653
Iteration 26, loss = 0.57547912
Iteration 27, loss = 0.57496218
Iteration 28, loss = 0.57428907
Iteration 29, loss = 0.57342919
Iteration 30, loss = 0.57227338
Iteration 31, loss = 0.57176125
Iteration 32, los

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(16, 8, 3), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=3, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=1, warm_start=False)

# For test dataset<br>
first run the data set for functions:-features,cleandata and then update data

In [39]:
#adding missing columns in test dataset
def update(dataset):
    data=dataset.drop(selected_feat,axis=1)
    dataset=dataset.drop(data.columns,axis=1)
    print(dataset)
    dataset=cleandata(dataset)
    print(dataset)
    a=list(set(clean_data.columns)-set(dataset.columns))
    for i in a:
        dataset[i]=0
    return dataset

In [43]:
dataset=pd.read_csv("qudditch_testing_without_target _2_.csv")
new_df=pd.DataFrame()
new_df['id_num']=dataset['id_num']
dataset=features(dataset)
dataset=update(dataset)


     id_num  player_id  age  game_duration  num_game_moves  \
0         0        120    5              2              16   
1         1        426    6              1              39   
2         2        306    7              3              73   
3         3         23    4              9              63   
4         4        312    3              1              33   
5         5        397    7              1               0   
6         6        263    7              5              57   
7         7        239    6              0              27   
8         8         90    6              4              69   
9         9        315    4              5              64   
10       10        184    3              1               0   
11       11        100    5              7              62   
12       12         87    6              1              49   
13       13        337    5             11              30   
14       14        235    4             10               2   
15      

In [44]:
new=clf.predict(dataset)

In [45]:
new_df['quidditch_league_player']=new
new_df['quidditch_league_player'].replace(0.0,'NO',inplace=True)
new_df['quidditch_league_player'].replace(1.0,'YES',inplace=True)
new_df.to_csv('test_output.csv',index=None)