In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import os

In [17]:
from glob import glob

file_name = glob('edu/academy_edu/*.csv')[0]
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


In [18]:
df['NationalITy'] = df['NationalITy'].str.replace('KW', 'KuwaIT')
df['NationalITy'] 

0      KuwaIT
1      KuwaIT
2      KuwaIT
3      KuwaIT
4      KuwaIT
        ...  
475    Jordan
476    Jordan
477    Jordan
478    Jordan
479    Jordan
Name: NationalITy, Length: 480, dtype: object

In [19]:
df["Topic_class"] = df["Topic"]

In [20]:
science = ['IT', 'Math', 'Science', 'Biology', 'Chemistry', 'Geology']
df.loc[df["Topic"].isin(science),"Topic_class"] = "S"
df.loc[~ df["Topic"].isin(science),"Topic_class"] = "C"

In [21]:

df.loc[df["NationalITy"] == df["PlaceofBirth"], "immigration"] = 0
df.loc[df["NationalITy"] != df["PlaceofBirth"], "immigration"] = 1
df.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class,Topic_class,immigration
0,M,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M,S,0.0
1,M,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M,S,0.0
2,M,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L,S,0.0
3,M,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L,S,0.0
4,M,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M,S,0.0


In [22]:
df['StageID_value'] = df['StageID'].map(dict(lowerlevel=0,MiddleSchool=1,HighSchool=2))#ordinary encoding
df['StageID_value'].sample(5)

248    1
215    1
359    0
161    1
199    1
Name: StageID_value, dtype: int64

In [23]:
df['Class_value'] = df['Class'].map(dict(L=-1, M=0, H=1))#ordinary encoding
df['Class_value'].sample(5)

164    0
329    0
76     0
457    1
32    -1
Name: Class_value, dtype: int64

In [24]:
from sklearn import preprocessing

df['raisedhands'] = preprocessing.scale(df['raisedhands'])
df['VisITedResources'] = preprocessing.scale(df['VisITedResources'])
df['AnnouncementsView'] = preprocessing.scale(df['AnnouncementsView'])
df['Discussion'] = preprocessing.scale(df['Discussion'])
df[['Discussion','AnnouncementsView','VisITedResources','raisedhands']].sample(5)

Unnamed: 0,Discussion,AnnouncementsView,VisITedResources,raisedhands
395,1.040122,0.604933,1.30735,0.755355
353,-1.096867,-0.636438,1.034999,-0.708196
462,1.655865,1.282045,0.974476,0.820401
266,-0.951987,-0.974994,0.762647,-0.903336
172,-1.096867,0.567316,-0.992507,-0.870813


In [25]:
df.columns

Index(['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 'GradeID',
       'SectionID', 'Topic', 'Semester', 'Relation', 'raisedhands',
       'VisITedResources', 'AnnouncementsView', 'Discussion',
       'ParentAnsweringSurvey', 'ParentschoolSatisfaction',
       'StudentAbsenceDays', 'Class', 'Topic_class', 'immigration',
       'StageID_value', 'Class_value'],
      dtype='object')

In [26]:
X = pd.get_dummies(df.drop(['ParentschoolSatisfaction','Class','Class_value', 'immigration', 'Topic_class'], axis=1), 
                  columns=['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 
                  'GradeID', 'SectionID', 'Topic', 'Semester', 'Relation',
                  'ParentAnsweringSurvey',
                  'StudentAbsenceDays'],
                   drop_first = True)
y = df['Class']



In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [28]:
model_xgb = XGBClassifier(max_depth=10, n_estimators=200)
model_xgb.fit(X_train, y_train)





KeyboardInterrupt: 

In [None]:
pred = model_xgb.predict(X_test)
print(classification_report(y_test,pred))

In [None]:
fig = plt.figure(figsize=(25, 8))
plt.bar(X.columns, model_xgb.feature_importances_)
plt.xticks(rotation=90)
plt.show()