In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

from sklearn import datasets
from scipy.stats.stats import kendalltau

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
path = "/content/drive/MyDrive/Colab Notebooks/Balanced_data_hlh.csv"

### Read dataset file

In [None]:
df = pd.read_csv(path)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432452 entries, 0 to 432451
Data columns (total 24 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   GLOBALEVENTID   432452 non-null  int64  
 1   SQLDATE         432452 non-null  int64  
 2   MonthYear       432452 non-null  int64  
 3   FractionDate    432452 non-null  float64
 4   IsRootEvent     432452 non-null  int64  
 5   EventCode       432452 non-null  int64  
 6   EventBaseCode   432452 non-null  int64  
 7   EventRootCode   432452 non-null  int64  
 8   GoldsteinScale  432452 non-null  float64
 9   NumMentions     432452 non-null  int64  
 10  NumSources      432452 non-null  int64  
 11  NumArticles     432452 non-null  int64  
 12  AvgTone         432452 non-null  float64
 13  Actor1Geo_Type  432452 non-null  int64  
 14  Actor1Geo_Lat   432452 non-null  float64
 15  Actor1Geo_Long  432452 non-null  float64
 16  Actor2Geo_Type  432452 non-null  int64  
 17  Actor2Geo_

In [None]:
df.isna().sum()

GLOBALEVENTID     0
SQLDATE           0
MonthYear         0
FractionDate      0
IsRootEvent       0
EventCode         0
EventBaseCode     0
EventRootCode     0
GoldsteinScale    0
NumMentions       0
NumSources        0
NumArticles       0
AvgTone           0
Actor1Geo_Type    0
Actor1Geo_Lat     0
Actor1Geo_Long    0
Actor2Geo_Type    0
Actor2Geo_Lat     0
Actor2Geo_Long    0
ActionGeo_Type    0
ActionGeo_Lat     0
ActionGeo_Long    0
DATEADDED         0
Target            0
dtype: int64

In [None]:
df.describe()

Unnamed: 0,GLOBALEVENTID,SQLDATE,MonthYear,FractionDate,IsRootEvent,EventCode,EventBaseCode,EventRootCode,GoldsteinScale,NumMentions,...,Actor1Geo_Lat,Actor1Geo_Long,Actor2Geo_Type,Actor2Geo_Lat,Actor2Geo_Long,ActionGeo_Type,ActionGeo_Lat,ActionGeo_Long,DATEADDED,Target
count,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,...,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0,432452.0
mean,756994100.0,20182270.0,201822.591344,2018.636053,0.661801,146.44346,117.337707,11.594459,-3.493815,6.430087,...,31.300501,-9.472748,2.851778,31.291028,-9.400206,2.852948,31.329376,-9.495303,20182320000000.0,0.453038
std,187746300.0,20968.03,209.680291,2.078385,0.473097,169.582182,11.343093,1.107417,1.620268,5.78612,...,21.428568,74.794642,1.279359,21.440508,74.801685,1.279315,21.436037,74.772469,20963980000.0,0.49779
min,410400400.0,20150100.0,201501.0,2015.0137,0.0,100.0,100.0,10.0,-7.5,1.0,...,-85.6221,-178.983,0.0,-85.6221,-178.983,0.0,-85.6221,-178.983,20150200000000.0,0.0
25%,589307200.0,20161010.0,201610.0,2016.7753,0.0,111.0,111.0,11.0,-5.0,2.0,...,24.9086,-79.8431,2.0,24.9086,-79.8431,2.0,24.998425,-79.8431,20161000000000.0,0.0
50%,769472000.0,20180700.0,201807.0,2018.4986,1.0,114.0,112.0,11.0,-4.0,5.0,...,37.5715,-0.116667,3.0,37.5664,-0.116667,3.0,37.669,-0.116667,20180700000000.0,0.0
75%,924458300.0,20200520.0,202005.0,2020.3781,1.0,128.0,120.0,12.0,-2.0,10.0,...,44.0407,37.6156,4.0,44.0,37.6156,4.0,44.0407,37.6156,20200500000000.0,1.0
max,1057500000.0,20220810.0,202208.0,2022.5918,1.0,1431.0,145.0,14.0,-2.0,418.0,...,80.3365,179.833,5.0,80.3365,179.833,5.0,80.3365,179.833,20220800000000.0,1.0


### Crop 2 years data in dataframe

In [None]:
# experiment the prediction of event code
df = df.loc[df['EventRootCode']<14]

df_cropped = df.loc[df['MonthYear'] >= 202001]
df_cropped = df_cropped.loc[df_cropped['MonthYear'] <= 202112]

X = df_cropped.iloc[:,0:23]
y = df_cropped['EventRootCode']



In [None]:
X.drop('EventRootCode', axis=1, inplace = True)
X

Unnamed: 0,GLOBALEVENTID,SQLDATE,MonthYear,FractionDate,IsRootEvent,EventCode,EventBaseCode,GoldsteinScale,NumMentions,NumSources,...,Actor1Geo_Type,Actor1Geo_Lat,Actor1Geo_Long,Actor2Geo_Type,Actor2Geo_Lat,Actor2Geo_Long,ActionGeo_Type,ActionGeo_Lat,ActionGeo_Long,DATEADDED
290167,896857612,20200105,202001,2020.0137,1,114,114,-2.0,4,1,...,4,-25.706900,28.229400,4,-25.706900,28.229400,4,-25.706900,28.229400,2.020010e+13
290168,896858922,20200105,202001,2020.0137,1,114,114,-2.0,2,1,...,4,-25.713000,28.365300,4,-25.713000,28.365300,4,-25.713000,28.365300,2.020010e+13
290169,896858923,20200105,202001,2020.0137,1,114,114,-2.0,2,1,...,4,-25.706900,28.229400,4,-25.706900,28.229400,4,-25.706900,28.229400,2.020010e+13
290170,896862194,20200105,202001,2020.0137,0,100,100,-5.0,10,1,...,1,60.000000,100.000000,1,60.000000,100.000000,1,60.000000,100.000000,2.020010e+13
290171,896864915,20200105,202001,2020.0137,0,110,110,-2.0,4,1,...,2,31.106000,-97.647500,2,31.106000,-97.647500,2,31.106000,-97.647500,2.020010e+13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431482,1055815737,20210726,202107,2021.5644,1,112,112,-2.0,5,1,...,2,44.040700,-72.709300,2,44.040700,-72.709300,2,44.040700,-72.709300,2.022070e+13
431490,1055824525,20210726,202107,2021.5644,0,130,130,-4.4,10,1,...,1,42.833333,12.833333,1,42.833333,12.833333,1,42.833333,12.833333,2.022070e+13
431574,1055935018,20210727,202107,2021.5671,1,130,130,-4.4,5,1,...,1,54.000000,-4.000000,1,54.000000,-4.000000,1,54.000000,-4.000000,2.022070e+13
431716,1056157305,20210728,202107,2021.5699,1,112,112,-2.0,2,1,...,2,40.314000,-74.508900,2,40.314000,-74.508900,2,40.314000,-74.508900,2.022070e+13


In [None]:
target_balance = y.value_counts()
target_balance

11    51560
12    22197
13    20686
10    15595
Name: EventRootCode, dtype: int64

### Feature selection

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

model = RandomForestClassifier()
rfe = RFE(model, n_features_to_select=19)

fit = rfe.fit(X_train, y_train)

print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
selected_features = X.columns[(fit.get_support())]
print(selected_features)

Num Features: 19
Selected Features: [ True  True  True  True False  True  True  True  True False  True  True
  True  True  True False  True  True  True  True  True  True]
Feature Ranking: [1 1 1 1 2 1 1 1 1 4 1 1 1 1 1 3 1 1 1 1 1 1]
Index(['GLOBALEVENTID', 'SQLDATE', 'MonthYear', 'FractionDate', 'EventCode',
       'EventBaseCode', 'GoldsteinScale', 'NumMentions', 'NumArticles',
       'AvgTone', 'Actor1Geo_Type', 'Actor1Geo_Lat', 'Actor1Geo_Long',
       'Actor2Geo_Lat', 'Actor2Geo_Long', 'ActionGeo_Type', 'ActionGeo_Lat',
       'ActionGeo_Long', 'DATEADDED'],
      dtype='object')


In [None]:
#X.drop('EventRootCode',axis = 1, inplace = True)
X.drop('NumSources',axis = 1, inplace = True)
X.drop('Actor1Geo_Type',axis = 1, inplace = True)
X.drop('Actor2Geo_Type',axis = 1, inplace = True)
X

Unnamed: 0,GLOBALEVENTID,SQLDATE,MonthYear,FractionDate,IsRootEvent,EventCode,EventBaseCode,GoldsteinScale,NumMentions,NumArticles,AvgTone,Actor1Geo_Lat,Actor1Geo_Long,Actor2Geo_Lat,Actor2Geo_Long,ActionGeo_Type,ActionGeo_Lat,ActionGeo_Long,DATEADDED
290167,896857612,20200105,202001,2020.0137,1,114,114,-2.0,4,4,-4.861111,-25.706900,28.229400,-25.706900,28.229400,4,-25.706900,28.229400,2.020010e+13
290168,896858922,20200105,202001,2020.0137,1,114,114,-2.0,2,2,-4.861111,-25.713000,28.365300,-25.713000,28.365300,4,-25.713000,28.365300,2.020010e+13
290169,896858923,20200105,202001,2020.0137,1,114,114,-2.0,2,2,-4.861111,-25.706900,28.229400,-25.706900,28.229400,4,-25.706900,28.229400,2.020010e+13
290170,896862194,20200105,202001,2020.0137,0,100,100,-5.0,10,10,-1.033592,60.000000,100.000000,60.000000,100.000000,1,60.000000,100.000000,2.020010e+13
290171,896864915,20200105,202001,2020.0137,0,110,110,-2.0,4,4,-0.340716,31.106000,-97.647500,31.106000,-97.647500,2,31.106000,-97.647500,2.020010e+13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431482,1055815737,20210726,202107,2021.5644,1,112,112,-2.0,5,5,-4.318182,44.040700,-72.709300,44.040700,-72.709300,2,44.040700,-72.709300,2.022070e+13
431490,1055824525,20210726,202107,2021.5644,0,130,130,-4.4,10,10,-4.746494,42.833333,12.833333,42.833333,12.833333,1,42.833333,12.833333,2.022070e+13
431574,1055935018,20210727,202107,2021.5671,1,130,130,-4.4,5,5,-7.142857,54.000000,-4.000000,54.000000,-4.000000,1,54.000000,-4.000000,2.022070e+13
431716,1056157305,20210728,202107,2021.5699,1,112,112,-2.0,2,2,-1.704545,40.314000,-74.508900,40.314000,-74.508900,2,40.314000,-74.508900,2.022070e+13


### Logistic regression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

model = LogisticRegression()
model = model.fit(X_train,y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Train Accuracy for Logistic Regression Algorithm:",accuracy_score(y_pred_train, y_train))
print("Test Accuracy for Logistic Regression Algorithm:",accuracy_score(y_pred_test, y_test))

Train Accuracy for Logistic Regression Algorithm: 0.4686811314324662
Test Accuracy for Logistic Regression Algorithm: 0.4681025081788441


### Random forest

In [None]:
model = RandomForestClassifier()
model = model.fit(X_train,y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Train Accuracy for Random Forest Algorithm:",accuracy_score(y_pred_train, y_train))
print("Test Accuracy for Random Forest Algorithm:",accuracy_score(y_pred_test, y_test))

Train Accuracy for Random Forest Algorithm: 1.0
Test Accuracy for Random Forest Algorithm: 1.0


### Decision tree

In [None]:
model = DecisionTreeClassifier()
model = model.fit(X_train,y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Train Accuracy for Decision Tree Algorithm:",accuracy_score(y_pred_train, y_train))
print("Test Accuracy for Decision Tree Algorithm:",accuracy_score(y_pred_test, y_test))

Train Accuracy for Decision Tree Algorithm: 1.0
Test Accuracy for Decision Tree Algorithm: 1.0


### k-NN

In [None]:
model = KNeighborsClassifier()
model = model.fit(X_train,y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Train Accuracy for k-NN Algorithm:",accuracy_score(y_pred_train, y_train))
print("Test Accuracy for k-NN Algorithm:",accuracy_score(y_pred_test, y_test))

Train Accuracy for k-NN Algorithm: 0.38384641599454733
Test Accuracy for k-NN Algorithm: 0.37808978553253364


### XGBoost

In [None]:
# XG Boost
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)
print(y.unique())
print(type(D_test))


[11 10 12 13]
<class 'xgboost.core.DMatrix'>


In [None]:
param = {
    'eta': 0.3, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 19}
steps = 25

model = xgb.train(param, D_train, steps)
y_pred_train = model.predict(D_train)
y_pred_test = model.predict(D_test)
best_pred_train = np.asarray([np.argmax(line) for line in y_pred_train])
best_pred_test = np.asarray([np.argmax(line) for line in y_pred_test])

print("Training Accuracy for XGBoost algorithm:",accuracy_score(y_train, best_pred_train))
print("Testing Accuracy for XGBoost algorithm:",accuracy_score(y_test, best_pred_test))