#### PGGM Bootcamp Text Analytics 2020
*Notebook by [Pedro V Hernandez Serrano](https://github.com/pedrohserrano)*

---
![](images/3_2.png)

# 3.2 Supervised ML Classifiers
* [3.2.1. Core dataset](#3.2.1)
* [3.2.2. Classifiers](#3.2.2)

---

In [1]:
import pandas as pd
import numpy as np

---
### 3.2.1. Core dataset
<a id="3.2.1"></a>

In [8]:
# Load the text-feature table; the first CSV column becomes the row index.
# Rows containing any missing value are discarded straight away.
# (For quick experiments, subsample with dataset.sample(frac=0.1, random_state=10).)
dataset = pd.read_csv('datasets/table_text_features.csv', index_col=0).dropna()

In [10]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,company_name,count_tokens,average_sentence_length,percentage_complex_word,positive_score,negative_score,uncertainty_score,constraining_score
ABN_AMRO_Group_(2018).pdf,ABN_AMRO_Group,106119,106119,0.304931,834,1401,1728,686
AGNC_Investment_(2018).pdf,AGNC_Investment,47122,47122,0.301091,297,858,963,412
A_G_Barr_(2018).pdf,A_G_Barr,54311,54311,0.272947,662,461,457,230
Aboitiz_Power_(2018).pdf,Aboitiz_Power,68940,68940,0.261053,262,529,425,224
Acer_(2018).pdf,Acer,89820,89820,0.327533,684,1134,572,443


In [11]:
# Summary statistics for the numeric feature columns.
# NOTE(review): in the saved output the average_sentence_length statistics are
# identical to those of count_tokens — this looks like an upstream
# feature-extraction issue; confirm before relying on that column.
dataset.describe()

Unnamed: 0,count_tokens,average_sentence_length,percentage_complex_word,positive_score,negative_score,uncertainty_score,constraining_score
count,92.0,92.0,92.0,92.0,92.0,92.0,92.0
mean,65845.902174,65845.902174,0.287568,496.98913,743.967391,655.043478,333.108696
std,38649.860479,38649.860479,0.02318,318.515392,544.926963,520.477737,244.290966
min,3446.0,3446.0,0.219561,42.0,26.0,13.0,5.0
25%,39658.0,39658.0,0.274329,263.5,380.75,326.0,142.0
50%,61156.0,61156.0,0.288838,425.5,624.5,584.0,299.0
75%,80126.0,80126.0,0.302842,625.75,954.5,832.25,443.75
max,195942.0,195942.0,0.344767,1417.0,3246.0,3470.0,1340.0


In [12]:
# Binary target: 1 when a report's uncertainty score is strictly below the
# median over all reports, 0 otherwise (a score equal to the median gets 0).
# The median is computed once up front instead of once per row.
uncertainty_median = dataset['uncertainty_score'].median()
dataset['invest'] = (dataset['uncertainty_score'] < uncertainty_median).astype(int)

In [14]:
# Fraction of rows labelled invest == 1 (class-balance check).
dataset['invest'].mean()

0.5

In [16]:
# Raw label vector as a NumPy array, in row order.
dataset['invest'].values

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0])

In [32]:
# Preview again to confirm the new invest column sits alongside the features.
dataset.head()

Unnamed: 0,company_name,count_tokens,average_sentence_length,percentage_complex_word,positive_score,negative_score,uncertainty_score,constraining_score,invest
ABN_AMRO_Group_(2018).pdf,ABN_AMRO_Group,106119,106119,0.304931,834,1401,1728,686,0
AGNC_Investment_(2018).pdf,AGNC_Investment,47122,47122,0.301091,297,858,963,412,0
A_G_Barr_(2018).pdf,A_G_Barr,54311,54311,0.272947,662,461,457,230,1
Aboitiz_Power_(2018).pdf,Aboitiz_Power,68940,68940,0.261053,262,529,425,224,1
Acer_(2018).pdf,Acer,89820,89820,0.327533,684,1134,572,443,1


---
### 3.2.2. Classifiers
<a id="3.2.2"></a>

We import the popular libraries for machine learning.

In [17]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string

In [21]:
# Feature matrix (three text-derived columns) and binary target as NumPy arrays
X = dataset.loc[:, ['count_tokens', 'constraining_score', 'negative_score']].values
y = dataset.loc[:, 'invest'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Label-encode the target. The encoder is fitted on the training labels only and
# then re-used for the test labels, so both splits share one class mapping
# (re-fitting on y_test could map classes differently if a class were missing there).
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Standardise the features using statistics learned from the training split only
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)



In [22]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, label_test):
    """Fit ``classifier`` on the training data and return its accuracy on the test data.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place.
    feature_vector_train : features used for fitting.
    label : training labels aligned with ``feature_vector_train``.
    feature_vector_test : features to predict on.
    label_test : true labels aligned with ``feature_vector_test``.

    Returns
    -------
    The value of ``metrics.accuracy_score`` for the test predictions.
    """
    classifier.fit(feature_vector_train, label)
    predicted_labels = classifier.predict(feature_vector_test)
    # accuracy_score is documented as (y_true, y_pred): pass the true labels first
    return metrics.accuracy_score(label_test, predicted_labels)

In [23]:
# Random forest on the selected features. A fixed seed makes the reported
# accuracy reproducible from run to run (same seed as the train/test split).
rf_classifier = ensemble.RandomForestClassifier(random_state=0)
accuracy = train_model(rf_classifier, X_train, y_train, X_test, y_test)
print("RF, Selected variables: {}".format(accuracy))

RF, Selected variables: 0.9565217391304348




---