In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
Data=pd.read_csv(r'../input/globalterrorismdb.csv',encoding='ISO-8859-1',low_memory=False)

Now, we will see the features with  missing values greater than 100,000 and sort them.

In [3]:
missing = Data.isnull().sum()
missing = missing[missing > 100000]
missing.sort_values(inplace=True)
missing.head(10)

scite2             100765
propextent         109805
propextent_txt     109805
propcomment        115919
motive             121764
location           122376
scite3             131133
propvalue          134863
alternative        143719
alternative_txt    143719
dtype: int64

Let's drop the features with missing values  more than 100,000

In [4]:
Data.drop(missing[missing>100000].index,axis=1,inplace=True)

We will also drop features which are explaining the features as we have the encoded feature of the same.

In [5]:
Data.drop(['country_txt','region_txt','attacktype1_txt',
           'targtype1_txt','natlty1_txt','targsubtype1_txt','weaptype1_txt','weapsubtype1_txt'],axis=1,inplace=True)

In [6]:
#Finding the categorical variables
Categ = [f for f in Data.columns if Data.dtypes[f] == 'object']
Categ

['provstate',
 'city',
 'summary',
 'corp1',
 'target1',
 'gname',
 'weapdetail',
 'scite1',
 'dbsource']

In our Target variable  **'gname' **, we have around 1700 group names where we have only 1 row for the corresponding group name.
So, we will create duplicate rows for these group names as we will only use half of the dataset and we don't want some of the group names missing.

In [7]:
z=Data['gname'].value_counts().to_frame()
M=z['gname'][z['gname']==1]
M=M.to_frame()
M['groupname']=M.index
del missing

In [8]:
for i in M['groupname']:
    A=Data['gname']==i
    df=Data[A]
    Data=Data.append([df]*4,ignore_index=True)

In [9]:
#Double checking to have a minimum 4 rows for every gname name 
aa=Data['gname'].value_counts().to_frame()
bb=aa['gname'][aa['gname']<4]
bb=bb.to_frame()
bb['groupname']=bb.index

In [10]:
for i in bb['groupname']:
    A=Data['gname']==i
    df=Data[A]
    Data=Data.append([df]*2,ignore_index=True)

In [11]:
#Let's encode categorical variables with numbers
x=Data[Categ].apply(lambda x: pd.factorize(x)[0])

In [12]:
#Updating our main dataframe with encoded categorical features
for i in Categ:
    Data[i]=x[i]

In [13]:
#Filling missing values with the mode of the feature
Data=Data.fillna(Data.mode().iloc[0])
del Categ
Data.head()

Unnamed: 0,eventid,iyear,imonth,iday,extended,country,region,provstate,city,latitude,...,nwoundte,property,ishostkid,ransom,scite1,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
0,197000000001,1970,7,2,0,58,2,-1,0,18.456792,...,0.0,0,0.0,0.0,-1,0,0,0,0,0
1,197000000002,1970,0,0,0,130,1,-1,1,19.432608,...,0.0,0,1.0,1.0,-1,0,0,1,1,1
2,197001000001,1970,1,0,0,160,5,0,2,15.478598,...,0.0,0,0.0,0.0,-1,0,-9,-9,1,1
3,197001000002,1970,1,0,0,78,8,1,3,37.983773,...,0.0,1,0.0,0.0,-1,0,-9,-9,1,1
4,197001000003,1970,1,0,0,101,4,-1,4,33.580412,...,0.0,1,0.0,0.0,-1,0,-9,-9,1,1


In [14]:
Data.corr()['gname'].head()

eventid     0.312814
iyear       0.312780
imonth      0.013099
iday        0.006893
extended    0.134767
Name: gname, dtype: float64

In [15]:
#Creating final dataframe with features having correlation greater than 0.1 with target variable 'gname'
Data_final=Data[Data.corr()['gname'][Data.corr()['gname']>0.1].index]
Data_final.head()

Unnamed: 0,eventid,iyear,extended,region,provstate,city,longitude,summary,doubtterr,multiple,...,guncertain1,claimed,weaptype1,weapdetail,nkillter,scite1,dbsource,INT_LOG,INT_IDEO,INT_ANY
0,197000000001,1970,0,2,-1,0,-69.951164,-1,0,0,...,0.0,0.0,13,-1,0.0,-1,0,0,0,0
1,197000000002,1970,0,1,-1,1,-99.133207,-1,0,0,...,0.0,0.0,13,-1,0.0,-1,0,0,1,1
2,197001000001,1970,0,5,0,2,120.599741,-1,0,0,...,0.0,0.0,13,-1,0.0,-1,0,-9,-9,1
3,197001000002,1970,0,8,1,3,23.728157,-1,0,0,...,0.0,0.0,6,0,0.0,-1,0,-9,-9,1
4,197001000003,1970,0,4,-1,4,130.396361,-1,-9,0,...,0.0,0.0,8,1,0.0,-1,0,-9,-9,1


In [16]:
y=Data_final['gname']

In [17]:
Data_final.drop('gname',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
X=Data_final

In [19]:
#Splitting the dataset into half as we will use only half of it for our model
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y,train_size=0.5,stratify=y,random_state=42)



In [20]:
del z,M,aa,bb,X_test1, y_test1
del X,y, Data_final,Data

In [None]:
#Let's see the important features of our model
forest = RandomForestClassifier(n_estimators=20,max_features=None,criterion='entropy',random_state=2)
forest.fit(X_train1, y_train1)
importances = forest.feature_importances_


In [None]:
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [None]:
print("Feature ranking:")

for f in range(X_train1.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plotting the feature importances of the forest    
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train1.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train1.shape[1]), indices)
plt.xlim([-1, X_train1.shape[1]])
plt.show()

In [None]:
#from sklearn.model_selection import GridSearchCV
#param_grid = { 'n_estimators': [10,100],'max_features': ['auto', 'sqrt', 'log2'] }
#forest = RandomForestClassifier()
#grid_search = GridSearchCV(forest, param_grid, cv=5)
#grid_search.fit(X_train, y_train)
#print("Train set score: {:.2f}".format(grid_search.score(X_train, y_train)))
#print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))

In [21]:
#we will use the top six features for our model
temp=X_train1[['region','provstate','longitude','eventid','INT_LOG','scite1']]

In [22]:
del X_train1

In [25]:
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
forest = RandomForestClassifier(n_estimators=20,max_features=None,criterion='entropy',random_state=2)
print("Cross-validation scores:\n{}".format(cross_val_score(forest, temp, y_train1, cv=kfold)))

Cross-validation scores:
[0.8581796  0.86128192 0.8583458  0.86045094 0.85790261]
