In [1]:
import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame 
import xgboost as xgb 

In [2]:
TRAIN_DATA_PATH = "Data/train.csv.zip"
TEST_DATA_PATH = "Data/test.csv.zip"
train = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)

In [3]:
train.columns=['id','comment_text','toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] 

In [4]:
train.head() 

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
l=LabelEncoder() 
l.fit(train.comment_text.astype(str)) 
#df[cat] = le.fit_transform(df[cat].astype(str))

LabelEncoder()

In [6]:
l.classes_ 
train.comment_text=Series(l.transform(train.comment_text.astype(str)))  #label encoding our target variable 
train.comment_text.value_counts() 

2047      1
60727     1
42302     1
48445     1
46396     1
36155     1
34106     1
40249     1
38200     1
58678     1
15661     1
64821     1
62772     1
52531     1
50482     1
56625     1
54576     1
11567     1
44351     1
152928    1
154977    1
148834    1
85379     1
83330     1
89473     1
87424     1
142703    1
140654    1
146797    1
144748    1
         ..
49869     1
51916     1
62155     1
64202     1
58057     1
60104     1
37575     1
39622     1
33477     1
35524     1
13011     1
725       1
105190    1
6870      1
99045     1
101092    1
111331    1
113378    1
107233    1
109280    1
21215     1
23262     1
17117     1
19164     1
29403     1
31450     1
25305     1
27352     1
4823      1
0         1
Name: comment_text, Length: 159571, dtype: int64

In [7]:
#One Hot Encoding of the Categorical features 
one_hot_toxic=pd.get_dummies(train.toxic.astype(str)) 
one_hot_severe_toxic=pd.get_dummies(train.severe_toxic.astype(str)) 
one_hot_obscene=pd.get_dummies(train.obscene.astype(str)) 
one_hot_threat=pd.get_dummies(train.threat.astype(str))
one_hot_insult=pd.get_dummies(train.insult.astype(str)) 
one_hot_identity_hate=pd.get_dummies(train.identity_hate.astype(str))

In [8]:
train.drop(['id','toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],axis=1,inplace=True) 

In [9]:
#Merging one hot encoded features with our dataset 'data' 
data=pd.concat([train,one_hot_toxic,one_hot_severe_toxic,one_hot_obscene,one_hot_threat,one_hot_insult,one_hot_identity_hate],axis=1) 
data.head()


Unnamed: 0,comment_text,0,1,0.1,1.1,0.2,1.2,0.3,1.3,0.4,1.4,0.5,1.5
0,72696,1,0,1,0,1,0,1,0,1,0,1,0
1,68357,1,0,1,0,1,0,1,0,1,0,1,0
2,79592,1,0,1,0,1,0,1,0,1,0,1,0
3,35519,1,0,1,0,1,0,1,0,1,0,1,0
4,146426,1,0,1,0,1,0,1,0,1,0,1,0


In [10]:
#removing dulpicate columns 
u, i = np.unique(data.columns, return_index=True) 
data=data.iloc[:, i] 

In [11]:
#Here our target variable is 'Income' with values as 1 or 0.  
#Separating our data into features dataset x and our target dataset y 
x=data.drop('comment_text',axis=1) 
y=data.comment_text 


In [12]:
#Imputing missing values in our target variable 
y.fillna(y.mode()[0],inplace=True) 

#Now splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3)

In [13]:
#The data is stored in a DMatrix object 
#label is used to define our outcome variable
dtrain=xgb.DMatrix(x_train,label=y_train)
dtest=xgb.DMatrix(x_test)

In [14]:
#setting parameters for xgboost
parameters={'max_depth':7, 'eta':1, 'silent':1,'objective':'binary:logistic','eval_metric':'auc','learning_rate':.05}

In [None]:
#training our model 
num_round=50
from datetime import datetime 
start = datetime.now() 
xg=xgb.train(parameters,dtrain,num_round) 
stop = datetime.now()

In [None]:
#Execution time of the model 
execution_time_xgb = stop-start 
execution_time_xgb
#now predicting our model on test set 
ypred=xg.predict(dtest) 
ypred

#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred[i]>=.5:       # setting threshold to .5 
       ypred[i]=1 
    else: 
       ypred[i]=0  

In [None]:
#calculating accuracy of our model 
from sklearn.metrics import accuracy_score 
accuracy_xgb = accuracy_score(y_test,ypred) 
accuracy_xgb