In [1]:
#Importing the required modules for NLP
import pandas as pd
import numpy as np # linear algebra
import seaborn as sns
import matplotlib.pyplot as plt
import string
import re
from collections import Counter
from nltk.corpus import stopwords
stopwords = stopwords.words('portuguese')

In [2]:
#Getting the data from source excel
train_df = pd.read_excel('Thinkcol_case_study.xlsx',sheet_name='train')
test_df = pd.read_excel('Thinkcol_case_study.xlsx',sheet_name='predict')

#### Basic data exploration

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1743 entries, 0 to 1742
Data columns (total 4 columns):
ID          1743 non-null int64
Mention     1743 non-null object
Target      1743 non-null int64
Category    1743 non-null object
dtypes: int64(2), object(2)
memory usage: 54.5+ KB


In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48415 entries, 0 to 48414
Data columns (total 4 columns):
id            48415 non-null int64
Message ID    48415 non-null int64
Date          48415 non-null datetime64[ns]
Mention       48385 non-null object
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 1.5+ MB


In [5]:
# Drop NaN values from test data
test_df.dropna(axis = 0, subset=['Mention'],inplace =True)

In [6]:
train_df['Target'].value_counts() #imbalanced dataset

0    1666
1      77
Name: Target, dtype: int64

In [7]:
import spacy

In [8]:
# loading spacy module for Portugese language
nlp = spacy.load('pt')

In [9]:
#Selecting the predictors and target variables
X = train_df['Mention']
y = train_df['Target']

In [10]:
# Splitting the train data into train and validation data
from  sklearn.model_selection import train_test_split
train,test = train_test_split(train_df, test_size=0.33, random_state=42)

In [11]:
punctuations = string.punctuation

In [12]:
train.head()

Unnamed: 0,ID,Mention,Target,Category
341,3906756,COMPREI UM CEL DA MOTOROLA MOTOG4 PLUS E O APA...,0,Service Location
147,4281157,NA IDA SEGUINTE ME DISSERAM QUE FOI TROCADO O ...,0,Service Location
360,3948439,E não bastando esse transtorno de ficar quase...,0,Service Location
83,5760094,DOIS MESES DEPOIS O CELULAR MOTO G 5 S PLUS N...,0,Service Location
48,4201964,AI ELES ME DISSERAM PARA EU LEVAR O APARELHO...,0,Service Location


In [13]:
# # Define function to cleanup text by removing personal pronouns, stopwords, and punctuations
# def cleanup_text(docs, logging=False):
#     texts = []
    
#     for doc in docs:
#         doc = nlp(doc, disable=['parser', 'ner'])
#         tokens = [tok.lemma_.lower().strip() for tok in doc]
#         tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
#         tokens = ' '.join(tokens)
#         texts.append(tokens)
#     return pd.Series(texts)

In [14]:
# INFO_text = [text for text in train['Mention']]

# INFO_clean = cleanup_text(INFO_text)
# INFO_clean = ' '.join(INFO_clean).split()

In [15]:
# Importing necessary modules and machine learning classification models 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
import re
from spacy.lang.pt import Portuguese
parser = Portuguese()

In [16]:
punctuations = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","😍","❤️","😐","✌️","❤","👊","✋","📱📲","👌","🚂","📱➡️"]

In [17]:
# Cleaning the text
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    if type(text)=='str':
        text = text.strip().replace("\n", " ").replace("\r", " ")
        text = text.lower()
    else:
        text = str(text)
        
    return text

In [18]:
# Tokenization and lemmatization by removing personal pronouns, stopwords, and punctuations
def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        #if tok.isalpha():
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
    return tokens

In [19]:
import emoji 
def give_emoji_free_text(text):
    allchars = [str for str in text.decode('utf-8')]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.decode('utf-8').split() if not any(i in str for i in emoji_list)])
    return clean_text

In [20]:
# Spot Check Algorithms
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGB', XGBClassifier()))
models.append(('NB', MultinomialNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
names = []

In [21]:
# Count vectorization
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
# Preparing the data for train and validation by selecting the predictor and target variables
train1 = train['Mention'].tolist() # train data
labelsTrain1 = train['Target'].tolist()

test1 = test['Mention'].tolist() #validation data
labelsTest1 = test['Target'].tolist()

In [22]:
# Creating dataframe to maintain evaluation metrics for different algorithms
Eval_metrics = pd.DataFrame(columns= ['Model','Precision','Recall','F1-score'])

In [23]:
# Help function for evaluation metrics data
def classification_report_df(report, name):
    dict = {}
    lines = report.split('\n')
    row_data = lines[-2].split('      ')
    dict = {'Model':name,'Precision' : float(row_data[1]),'Recall' : float(row_data[2]),'F1-score' : float(row_data[3]) }
    return dict

In [24]:
# Training and validating the data for different machine learning classification algorithms
for name , clf in models:
    pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])
    # training
    pipe.fit(train1, labelsTrain1)
    # validation testing
    preds = pipe.predict(test1)
    #Evaluation metrics for model
    report = metrics.classification_report(labelsTest1, preds)
    Evaluations = classification_report_df(report,name)
    Eval_metrics = Eval_metrics.append(Evaluations,ignore_index= True)        
    msg = "%s: %f" % (name, accuracy_score(labelsTest1, preds))
    print(msg)


Logistic Regression: 0.980903


  if diff:


XGB: 0.975694
NB: 0.968750


  'precision', 'predicted', average, warn_for)


SVM: 0.968750


In [25]:
# Checking the metrics for each classification model
Eval_metrics

Unnamed: 0,Model,Precision,Recall,F1-score
0,Logistic Regression,0.98,0.98,0.98
1,XGB,0.97,0.98,0.97
2,NB,0.96,0.97,0.96
3,SVM,0.94,0.97,0.95


In [26]:
#From the above evaluation metrics using Logisticregression as best model and training the data only using Logisticregression
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', LogisticRegression())])
# training the data
pipe.fit(train1, labelsTrain1)
# validating the data
preds = pipe.predict(test1)
msg = "%s: %f" % ('accuracy', accuracy_score(labelsTest1, preds))
print(msg)

accuracy: 0.980903


In [27]:
# Preparing test data
final_test = test_df['Mention'].tolist()

In [28]:
# Predicting the target variable for test data
predictions = pipe.predict(final_test)