In [1]:
!pip install texthero

Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 6.5MB/s 
Collecting nltk>=3.3
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 9.1MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp37-none-any.whl size=1434672 sha256=182c9dd9d9921aabea15223e8ac5d004cc74ce66be1966813294c8bdc9c6b51b
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a5

In [2]:
# Handling datasets 
import pandas as pd

# Some encoding and preprocessing textual data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import texthero
from texthero import preprocessing

# Set of ML classifiers to be used
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
def preprocess(data):
    data = data.copy()
    #removing diacritics
    data['text'] = preprocessing.remove_diacritics(data['text'])
    return data

In [4]:
# Read the training and testing datasets
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

In [5]:
# Preprocessing
train = preprocess(train_df)
test = preprocess(test_df)

In [6]:
# Building a vextorizer object that allow us to transform the texts into a numerical representation (vector)
# and fit it on the training pieces of texts
vectorizer = TfidfVectorizer(ngram_range=(1, 3)).fit(train['text'].values)
len(vectorizer.vocabulary_)

1087890

In [7]:
# Split the training dataset into training and validation
X_train, X_val, y_train, y_val = train_test_split(train, train['label'], test_size=0.2, random_state=0,stratify=train['label'])

In [8]:
# Transform training, validation and testing texts into vectors
X_train = vectorizer.transform(X_train['text'].values)
X_val = vectorizer.transform(X_val['text'].values)
X_test = vectorizer.transform(test['text'].values)

You can try to tweak the model's parameters in order to get better performances I used the default setting

# Logistic Regression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [9]:
clf1 = LogisticRegression(random_state=0)
model1 = clf1.fit(X_train, y_train)
pred1 = model1.predict(X_val)
print(classification_report(y_val,pred1,digits=4))

              precision    recall  f1-score   support

          -1     0.7357    0.8145    0.7731      5859
           0     1.0000    0.0264    0.0514       493
           1     0.8274    0.8115    0.8193      7648

    accuracy                         0.7851     14000
   macro avg     0.8544    0.5508    0.5479     14000
weighted avg     0.7951    0.7851    0.7729     14000




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



# SVM

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [10]:
clf2 = svm.SVC(C=2)
model2 = clf2.fit(X_train, y_train)
pred2 = model2.predict(X_val)
print(classification_report(y_val,pred2,digits=4))

              precision    recall  f1-score   support

          -1     0.7371    0.8307    0.7811      5859
           0     0.9310    0.0548    0.1034       493
           1     0.8393    0.8086    0.8237      7648

    accuracy                         0.7913     14000
   macro avg     0.8358    0.5647    0.5694     14000
weighted avg     0.7998    0.7913    0.7805     14000



# Random Forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [11]:
clf3 = RandomForestClassifier()
model3 = clf3.fit(X_train, y_train)
pred3 = model3.predict(X_val)
print(classification_report(y_val,pred3,digits=4))

              precision    recall  f1-score   support

          -1     0.7281    0.7247    0.7264      5859
           0     0.7206    0.0994    0.1747       493
           1     0.7707    0.8163    0.7929      7648

    accuracy                         0.7527     14000
   macro avg     0.7398    0.5468    0.5646     14000
weighted avg     0.7511    0.7527    0.7433     14000



# XGBoost

https://xgboost.readthedocs.io/en/latest/

In [12]:
clf4 = XGBClassifier()
model4 = clf4.fit(X_train, y_train)
pred4 = model4.predict(X_val)
print(classification_report(y_val,pred4,digits=4))

              precision    recall  f1-score   support

          -1     0.7195    0.4554    0.5578      5859
           0     0.7778    0.0284    0.0548       493
           1     0.6529    0.8771    0.7486      7648

    accuracy                         0.6707     14000
   macro avg     0.7167    0.4536    0.4537     14000
weighted avg     0.6852    0.6707    0.6443     14000



# LightGBM

https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html

In [13]:
clf5 = LGBMClassifier()
model5 = clf5.fit(X_train, y_train)
pred5 = model5.predict(X_val)
print(classification_report(y_val,pred5,digits=4))

              precision    recall  f1-score   support

          -1     0.6605    0.8396    0.7394      5859
           0     0.6508    0.0832    0.1475       493
           1     0.8365    0.7099    0.7680      7648

    accuracy                         0.7421     14000
   macro avg     0.7159    0.5442    0.5516     14000
weighted avg     0.7563    0.7421    0.7342     14000



# Make Prediction and create a submission file to submit it

In [14]:
# Predict on the test dataset based on one of your models SVM, RandomForest, XGBoost,...
pred = model2.predict(X_test) # I used SVM classifier

# Create the submission files with IDs and the prediction of your model
test_df['label'] = pred
submit = test_df[["ID","label"]]
submit.to_csv("starter_submission.csv", index=False)