In [1]:
import pandas as pd 
import re
import nltk 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np 
#train models 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import seaborn as sn
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Setting up the stats dataframe: one row per (algorithm, label) pair,
# filled in by the training cells further down.
import time
# pandas spells the class `DataFrame`; `pd.Dataframe` raises AttributeError.
stats = pd.DataFrame(columns=['Algorithm','Label','Accurary','Precision','Recall','F1','Time'])

# Get the data

In [None]:
# Read both CSV splits from the shared data directory.
train, test = pd.read_csv('../data/train.csv'), pd.read_csv('../data/test.csv')
# Preview the first rows of the training frame.
train.head()

# Studying the data

## Searching for null values

In [None]:
# Count abstracts that are empty strings.
print("Equals to empty", int((train['ABSTRACT'] == "").sum()))
# `series == None` is evaluated element-wise and is always False, so missing
# values were never detected; isna() is the check that actually finds them.
print("Equals to none", int(train['ABSTRACT'].isna().sum()))

In [None]:
# Which category is the most common?
# Everything from the fourth column onwards is treated as a label column.
cols = list(train.columns)[3:]

# Per-label totals, drawn as one bar per label.
train_sum = train[cols].sum()
fig = plt.figure(figsize=(10, 4))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(cols, train_sum)
plt.show()

## Correlation matrix


In [None]:
# Pairwise correlation between the label columns, annotated with the values.
label_corr = train[cols].corr()
sn.heatmap(label_corr, annot=True)

## Size of abstract text + title

In [None]:
# If some texts turn out to be very long, we might cut their size.
# Length is measured as the number of pieces obtained by splitting on a single space.
len_abstract = [len(abstract.split(' ')) for abstract in train['ABSTRACT']]
len_title = [len(title.split(' ')) for title in train['TITLE']]
len_text = [title_len + abstract_len for title_len, abstract_len in zip(len_title, len_abstract)]
x = list(range(len(train)))

# One point per row: row position against combined title + abstract length.
fig = plt.figure(figsize=(10, 4))
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(x, len_text, s=0.4)
plt.show()

## Most frequent words 

In [None]:
from collections import Counter  # not used in this cell; kept in case other cells rely on it

# Ten most frequent lower-cased, whitespace-separated tokens over all abstracts.
mostUsed = pd.Series(' '.join(train['ABSTRACT']).lower().split()).value_counts()[:10]
# Pass x/y by keyword: current seaborn releases no longer accept them positionally.
mostUsedGraph = sn.barplot(x=mostUsed.index, y=mostUsed.values)
mostUsedGraph.set(xlabel="Words", ylabel="Occurrencies")

# Cleaning data

## Preprocessing words

In [None]:
def normalize(df):
    """Clean the 'ABSTRACT' column of `df` into a list of normalised documents.

    Each document is stripped of non-alphabetic characters, lower-cased,
    split on whitespace, filtered against the English stop-word list and
    stemmed with the Porter stemmer. The surviving stems are joined with
    single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'ABSTRACT' column. Rows are read in positional order,
        so any index works. Values that are not strings (e.g. missing
        values) are treated as empty documents.

    Returns
    -------
    list of str
        One cleaned document per row of `df`, in row order.
    """
    ps = PorterStemmer()
    # Build the stop-word set once; rebuilding it for every token (as before)
    # dominated the running time of this function.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for text in df['ABSTRACT']:
        if not isinstance(text, str):
            # Missing value: keep the row so the output stays aligned with df.
            text = ''
        # remove non-alpha chars, then lower-case and tokenize
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        # stop-word removal followed by stemming
        corpus.append(' '.join(ps.stem(w) for w in tokens if w not in stop_words))
    return corpus

# To avoid repeating this cleaning on every run, we save the output as a csv
def save_csv(corpus_train, file_name):
    """Write the documents in `corpus_train` to `file_name` as a CSV with a single 'text' column.

    The default pandas index is written as well, so the file has an unnamed
    leading column followed by 'text'.
    """
    pd.DataFrame({'text': corpus_train}).to_csv(file_name)
    
# Build "<title> <abstract>" documents. The explicit space keeps the last word
# of the title from fusing with the first word of the abstract, and assign()
# returns a new frame so re-running this cell does not prepend the title again.
train_text = train.assign(ABSTRACT=train['TITLE'] + ' ' + train['ABSTRACT'])
corpus_train = normalize(train_text)
save_csv(corpus_train, 'preprocessed_train.csv')

test_text = test.assign(ABSTRACT=test['TITLE'] + ' ' + test['ABSTRACT'])
corpus_test = normalize(test_text)
save_csv(corpus_test, 'preprocessed_test.csv')


# Split input and output

In [None]:
def get_input(preprocessed, x_col_name, vectorizer=None, fit=True):
    """Turn a text column into a dense bag-of-words matrix.

    Parameters
    ----------
    preprocessed : pandas.DataFrame
        Frame holding the cleaned documents.
    x_col_name : str
        Name of the column with the documents.
    vectorizer : CountVectorizer, optional
        Vectorizer to use. When omitted, a new
        ``CountVectorizer(max_features=1500)`` is created (original behaviour).
    fit : bool, default True
        If True the vectorizer is fitted on the documents before transforming.
        Pass False together with an already fitted `vectorizer` to encode
        further data with the same vocabulary.

    Returns
    -------
    numpy.ndarray
        Dense document-term count matrix, one row per document.

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without supplying a vectorizer.
    """
    if vectorizer is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted vectorizer")
        vectorizer = CountVectorizer(max_features = 1500)

    # Documents that became empty during cleaning may be read back from CSV as
    # NaN, which CountVectorizer refuses; encode them as empty strings instead.
    documents = preprocessed.loc[:, x_col_name].fillna('')
    if fit:
        return vectorizer.fit_transform(documents).toarray()
    return vectorizer.transform(documents).toarray()

def get_outputs(df, y_cols_name):
    """Return a list with one array of values per column named in `y_cols_name`."""
    return [df.loc[:, col_name].values for col_name in y_cols_name]


preprocessed = pd.read_csv('preprocessed_train.csv')
preprocessed_submission = pd.read_csv('preprocessed_test.csv')
y_columns = train.columns[3:]

# One vectorizer is fitted on the training text and reused for the submission
# text. Fitting a second vectorizer on the submission set (as before) produced
# a different vocabulary, so the columns of X_submission did not correspond to
# the features the models were trained on.
text_vectorizer = CountVectorizer(max_features=1500)
X = get_input(preprocessed, 'text', vectorizer=text_vectorizer)
y = get_outputs(train, y_columns)
X_submission = get_input(preprocessed_submission, 'text', vectorizer=text_vectorizer, fit=False)

# Split train and test sets

In [None]:
# Split once per label. The fixed random_state means every iteration uses the
# same seed, so the last X_train / X_test are kept for all labels.
split_options = dict(test_size=0.20, random_state=0)
y_train_test = []
for i in range(len(y_columns)):
    X_train, X_test, y_train, y_test = train_test_split(X, y[i], **split_options)
    y_train_test.append([y_train, y_test])

# Shapes of the last split, then the label names.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)
print(y_columns)

# Oversampling 
Run this cell only when oversampling is desired.

In [None]:
# Oversample the training data for the labels at positions 4 and 5
# (presumably the quantitative biology / finance labels, going by the names below).
sm = SMOTE(random_state=12)
quant_biology_train_y, quant_finance_train_y = y_train_test[4][0], y_train_test[5][0]

# fit_resample returns the resampled features and labels; the labels replace
# the original training labels stored in y_train_test.
x_biology, y_train_test[4][0] = sm.fit_resample(X_train, quant_biology_train_y)
x_finance, y_train_test[5][0] = sm.fit_resample(X_train, quant_finance_train_y)


# Train the model and generate results

In [None]:
def print_statistics(y_test, y_pred, column_name):
    """Print and return accuracy, precision, recall and F1 for one label.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    column_name : str
        Name of the label being evaluated. Accepted for the callers'
        convenience; it is not used in the output.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]``.
    """
    # Compute every metric exactly once and reuse it for printing and returning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F1: ', f1)
    return [accuracy, precision, recall, f1]

In [None]:
def get_x_train(index, with_oversampling=False):
    """Return the training feature matrix to use for the label at `index`.

    Parameters
    ----------
    index : int
        Position of the label in `y_columns`.
    with_oversampling : bool, default False
        When True, labels 4 and 5 use the oversampled matrices `x_biology`
        and `x_finance` produced by the oversampling cell. The default keeps
        single-argument calls working for the run without oversampling.

    Returns
    -------
    The oversampled matrix for labels 4 / 5 when requested, otherwise `X_train`.
    """
    if with_oversampling:
        if index == 4:
            return x_biology
        if index == 5:
            return x_finance
    return X_train

In [None]:
# Naive Bayes: train one binary classifier per label, record its metrics and
# fit time, and export the predictions for the submission set.
classifier = GaussianNB()
gaussian_df = pd.DataFrame()
gaussian_submission_df = pd.DataFrame()

# The optional oversampling cell defines x_biology / x_finance and replaces the
# matching training labels, so the oversampled features must be used whenever
# those names exist.
with_oversampling = 'x_biology' in globals() and 'x_finance' in globals()

for i in range(len(y_columns)):
    col_name = y_columns[i]
    print("COL:", col_name)
    y_train = y_train_test[i][0]
    y_test = y_train_test[i][1]
    x_train = get_x_train(i, with_oversampling)

    begin = time.time()
    classifier.fit(x_train, y_train)
    end = time.time()
    # predict() takes only the feature matrix.
    gaussian_df[col_name] = classifier.predict(X_test)
    gaussian_submission_df[col_name] = classifier.predict(X_submission)
    stat_array = print_statistics(y_test, gaussian_df[col_name].values, col_name)
    print()
    # DataFrame.append never modified `stats` in place (and no longer exists in
    # pandas 2); add the row through .loc instead.
    stats.loc[len(stats)] = ['Naive Bayes', col_name, *stat_array, end - begin]

index_submission = test.loc[:, 'ID'].values
print(len(gaussian_submission_df))
# Use the submission IDs as an index named 'ID'.
gaussian_submission_df.index = pd.Index(index_submission, name='ID')
print(gaussian_submission_df)
# Export the submission predictions (previously the validation-split
# predictions were written to this file).
gaussian_submission_df.to_csv("submission_gaussian.csv")

In [None]:
# SVM: one linear SVM per label, with metrics and fit time recorded in `stats`.

from sklearn.svm import LinearSVC


classifier = LinearSVC()
gaussian_df = pd.DataFrame()
gaussian_submission_df = pd.DataFrame()

# If the optional oversampling cell was run, labels 4 and 5 have resampled
# training labels and need the matching resampled features.
with_oversampling = 'x_biology' in globals() and 'x_finance' in globals()

for i in range(len(y_columns)):
    col_name = y_columns[i]
    print("COL:", col_name)
    y_train = y_train_test[i][0]
    y_test = y_train_test[i][1]
    x_train = get_x_train(i, with_oversampling)

    begin = time.time()
    classifier.fit(x_train, y_train)
    end = time.time()
    gaussian_df[col_name] = classifier.predict(X_test)
    gaussian_submission_df[col_name] = classifier.predict(X_submission)
    stat_array = print_statistics(y_test, gaussian_df[col_name].values, col_name)
    print()
    # DataFrame.append never modified `stats` in place (and no longer exists in
    # pandas 2); add the row through .loc instead.
    stats.loc[len(stats)] = ['Support Vector Machines', col_name, *stat_array, end - begin]

# Summary for the last label. The classifier is still fitted on that label
# from the final loop iteration, so no refit is needed.
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

In [None]:
# Logistic Regression: one model per label, with metrics and fit time recorded in `stats`.

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
gaussian_df = pd.DataFrame()
gaussian_submission_df = pd.DataFrame()

# If the optional oversampling cell was run, labels 4 and 5 have resampled
# training labels and need the matching resampled features.
with_oversampling = 'x_biology' in globals() and 'x_finance' in globals()

for i in range(len(y_columns)):
    col_name = y_columns[i]
    print("COL:", col_name)
    y_train = y_train_test[i][0]
    y_test = y_train_test[i][1]
    x_train = get_x_train(i, with_oversampling)

    begin = time.time()
    classifier.fit(x_train, y_train)
    end = time.time()
    gaussian_df[col_name] = classifier.predict(X_test)
    gaussian_submission_df[col_name] = classifier.predict(X_submission)
    stat_array = print_statistics(y_test, gaussian_df[col_name].values, col_name)
    print()
    # DataFrame.append never modified `stats` in place (and no longer exists in
    # pandas 2); add the row through .loc instead.
    stats.loc[len(stats)] = ['Logistic Regression', col_name, *stat_array, end - begin]

# Summary (with confusion matrix) for the last label, using the model fitted
# in the final loop iteration.
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

In [None]:
# Perceptron: one model per label, with metrics and fit time recorded in `stats`.

from sklearn.linear_model import Perceptron

classifier = Perceptron()
gaussian_df = pd.DataFrame()
gaussian_submission_df = pd.DataFrame()

# If the optional oversampling cell was run, labels 4 and 5 have resampled
# training labels and need the matching resampled features.
with_oversampling = 'x_biology' in globals() and 'x_finance' in globals()

for i in range(len(y_columns)):
    col_name = y_columns[i]
    print("COL:", col_name)
    y_train = y_train_test[i][0]
    y_test = y_train_test[i][1]
    x_train = get_x_train(i, with_oversampling)

    begin = time.time()
    classifier.fit(x_train, y_train)
    end = time.time()
    gaussian_df[col_name] = classifier.predict(X_test)
    gaussian_submission_df[col_name] = classifier.predict(X_submission)
    stat_array = print_statistics(y_test, gaussian_df[col_name].values, col_name)
    print()
    # DataFrame.append never modified `stats` in place (and no longer exists in
    # pandas 2); add the row through .loc instead.
    stats.loc[len(stats)] = ['Perceptron', col_name, *stat_array, end - begin]

# Summary (with confusion matrix) for the last label, using the model fitted
# in the final loop iteration.
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

In [None]:
# Decision Tree: one model per label, with metrics and fit time recorded in `stats`.

from sklearn.tree import DecisionTreeClassifier

# Seeded so that re-running the notebook reproduces the same tree.
classifier = DecisionTreeClassifier(random_state=0)
gaussian_df = pd.DataFrame()
gaussian_submission_df = pd.DataFrame()

# If the optional oversampling cell was run, labels 4 and 5 have resampled
# training labels and need the matching resampled features.
with_oversampling = 'x_biology' in globals() and 'x_finance' in globals()

for i in range(len(y_columns)):
    col_name = y_columns[i]
    print("COL:", col_name)
    y_train = y_train_test[i][0]
    y_test = y_train_test[i][1]
    x_train = get_x_train(i, with_oversampling)

    begin = time.time()
    classifier.fit(x_train, y_train)
    end = time.time()
    gaussian_df[col_name] = classifier.predict(X_test)
    gaussian_submission_df[col_name] = classifier.predict(X_submission)
    # The assignment operator was missing here, which made the whole cell a syntax error.
    stat_array = print_statistics(y_test, gaussian_df[col_name].values, col_name)
    print()
    # DataFrame.append never modified `stats` in place (and no longer exists in
    # pandas 2); add the row through .loc instead.
    stats.loc[len(stats)] = ['Decision Tree', col_name, *stat_array, end - begin]

# Summary (with confusion matrix) for the last label, using the model fitted
# in the final loop iteration.
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

In [None]:
# Random Forest: one model per label, with metrics and fit time recorded in `stats`.

from sklearn.ensemble import RandomForestClassifier

# Seeded so that re-running the notebook reproduces the same forest.
classifier = RandomForestClassifier(random_state=0)
gaussian_df = pd.DataFrame()
gaussian_submission_df = pd.DataFrame()

# If the optional oversampling cell was run, labels 4 and 5 have resampled
# training labels and need the matching resampled features.
with_oversampling = 'x_biology' in globals() and 'x_finance' in globals()

for i in range(len(y_columns)):
    col_name = y_columns[i]
    print("COL:", col_name)
    y_train = y_train_test[i][0]
    y_test = y_train_test[i][1]
    x_train = get_x_train(i, with_oversampling)

    begin = time.time()
    classifier.fit(x_train, y_train)
    end = time.time()
    gaussian_df[col_name] = classifier.predict(X_test)
    gaussian_submission_df[col_name] = classifier.predict(X_submission)
    stat_array = print_statistics(y_test, gaussian_df[col_name].values, col_name)
    print()
    # DataFrame.append never modified `stats` in place (and no longer exists in
    # pandas 2); add the row through .loc instead.
    stats.loc[len(stats)] = ['Random Forest', col_name, *stat_array, end - begin]

# Summary (with confusion matrix) for the last label, using the model fitted
# in the final loop iteration.
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

In [None]:
# Export stats to csv
