In [2]:
#Importing the required modules for NLP
import pandas as pd
import numpy as np # linear algebra
import re
from collections import Counter
import gensim
from gensim.utils import simple_preprocess #text processing
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('wordnet')
import string
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('portuguese')



In [3]:
#Getting the data from source excel
train_df = pd.read_excel('Thinkcol_case_study.xlsx',sheet_name='train')
test_df = pd.read_excel('Thinkcol_case_study.xlsx',sheet_name='predict')

### Doing basic exploration on data

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1743 entries, 0 to 1742
Data columns (total 4 columns):
ID          1743 non-null int64
Mention     1743 non-null object
Target      1743 non-null int64
Category    1743 non-null object
dtypes: int64(2), object(2)
memory usage: 54.5+ KB


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48415 entries, 0 to 48414
Data columns (total 4 columns):
id            48415 non-null int64
Message ID    48415 non-null int64
Date          48415 non-null datetime64[ns]
Mention       48385 non-null object
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 1.5+ MB


In [6]:
test_df.dropna(axis = 0, subset=['Mention'],inplace =True)

In [7]:
train_df['Target'].value_counts() #imbalanced dataset

0    1666
1      77
Name: Target, dtype: int64

In [8]:
punctuations = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","😍","❤️","😐","✌️","❤","👊","✋","📱📲","👌","🚂","📱➡️"]

In [9]:
#getting all the reviews in a list and preparing predictors and target
X=train_df.loc[:,"Mention"].copy()
y=train_df.loc[:,"Target"].copy()
test=train_df.loc[:,"Mention"].copy()
# Splitting the train data into train and validation data
from  sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.33, random_state=42,stratify = y)

In [10]:
#read a customized stopwords file for portugese language to remove unwanted words
txt = pd.read_csv('stopwords_portuguese.txt', sep=" ", header=None)
newstoplist=txt[0].tolist()
stopwords.extend(newstoplist)

In [11]:
# This function will handle cleaning the text, tokenization and lemmatization
def Pre_processing(text):
    corpus_raw=text.copy()
    corpus=list(corpus_raw.values)
    sentences =  list(filter(None, corpus))
    #doing some simple preprocessing
    process_list=[]
    for sentence in sentences:
        try:
            process_list.append(simple_preprocess(sentence))
        except:
            pass
    
    new_list=[]  
    #Tokenization and removal of stopwords and punctuations
    for sent in process_list:
        newsent=" ".join(sent)
        word_tokens = word_tokenize(newsent)
        filtered_sentence = [w for w in word_tokens if w not in stopwords and w not in punctuations]
        #print(filtered_sentence)
        new_list.append(filtered_sentence)
        
    #introducing lemmatization (to remove inflectional endings only and to return the base using vocabulary)
    lemma = WordNetLemmatizer()
    new_list2=[]
    for sent in new_list:
        normalized = " ".join(lemma.lemmatize(word,'n').lower() for word in sent)
        x = normalized.split()
        y = [s for s in x if len(s) > 2]
        new_list2.append(y)   
     #Preparing for vectorization
    text =[]
    for sent in new_list2:
        a = " ".join(word for word in sent )
        text.append(a)
    return  text   

In [12]:
# TF-IDF : To give more weightage to the important rather than frequent words like The, is ...
tfidf_vectorizer = TfidfVectorizer(input = Pre_processing ,stop_words=stopwords)

In [24]:
# Creating dataframe to store the evaluation metrics for each machine learning algorithm
Eval_metrics = pd.DataFrame(columns= ['Model','Precision','Recall','F1-score'])

In [14]:
#Help function for evaluation metrics
def classification_report_df(report, name):
    dict = {}
    lines = report.split('\n')
    row_data = lines[-2].split('      ')
    dict = {'Model':name,'Precision' : float(row_data[1]),'Recall' : float(row_data[2]),'F1-score' : float(row_data[3]) }
    return dict

In [15]:
# Importing necessary modules and classification models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [21]:
# Spot Check Algorithms
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGB', XGBClassifier()))
models.append(('LGB', LGBMClassifier()))
models.append(('NB', MultinomialNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
names = []

In [25]:
# Training the data for different machine learning classification algorithms
for name , clf in models:
    pipe = Pipeline([('data', tfidf_vectorizer), ('clf', clf)])
    # training
    pipe.fit(X_train, y_train)
    # validation test
    preds = pipe.predict(X_test)
    #Evaluating the model
    report = metrics.classification_report(y_test, preds)
    Evaluations = classification_report_df(report,name)
    Eval_metrics = Eval_metrics.append(Evaluations,ignore_index= True)        
    msg = "%s: %f" % (name, accuracy_score(y_test, preds))
    print(msg)

  'precision', 'predicted', average, warn_for)


Logistic Regression: 0.956597


  if diff:


XGB: 0.986111


  if diff:
  'precision', 'predicted', average, warn_for)


LGB: 0.984375
NB: 0.956597
SVM: 0.956597


  'precision', 'predicted', average, warn_for)


In [26]:
# Checking the metrics for each classification model
Eval_metrics

Unnamed: 0,Model,Precision,Recall,F1-score
0,Logistic Regression,0.92,0.96,0.94
1,XGB,0.99,0.99,0.98
2,LGB,0.98,0.98,0.98
3,NB,0.92,0.96,0.94
4,SVM,0.92,0.96,0.94


In [29]:
#From the above evaluation metrics using XGBoost as best model and training the data only using XGboost
clf = XGBClassifier()
pipe = Pipeline([('data', tfidf_vectorizer), ('clf', clf)])
# training the data
pipe.fit(X_train, y_train)
# validating the data
preds = pipe.predict(X_test)
print(metrics.classification_report(y_test, preds))
msg = "%s: %f" % ('accuracy', accuracy_score(y_test, preds))
print(msg)

             precision    recall  f1-score   support

          0       0.99      1.00      0.99       551
          1       1.00      0.68      0.81        25

avg / total       0.99      0.99      0.98       576

acurracy: 0.986111


  if diff:


In [30]:
# Predicting the target variable for main test data
predictions = pipe.predict(test)

  if diff:


In [35]:
predictions

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

#### Just looking the test data predictions to cross verify manually

In [36]:
np.where(predictions == 0)[0]

array([   3,    5,   19, ..., 1740, 1741, 1742], dtype=int64)

In [37]:
test = test.tolist()

In [38]:
test[1740:1743] # pred == 0

['\n\nSacanagem isso estão parecendo a Apple lançam aparelhos e param de dar suporte logo em seguida',
 '  de  un  reproductor  de  pelcula  donde  se  Nesta moldura ou suporte de placa existem dois flashes de xennio embutidos e um sensor de flash   um Elemento em uma Moto mandou eu parar  como no Parei  Isso frequentemente acontece quando deixamos o nosso celular no silencioso ou em um toque   voc ser convidado a ensinar como o Call Flash',
 'estou com o mesmo problema no meu moto g5 platinum você conseguiu resolver isso na assistencia quais foram os procedimentos ']

In [39]:
test[:3] # pred == 1

['moto G5 plus azul safira e com apenas 1 semana de uso o mesmo apresentou defeito o CHIP 2 não tem sinal e pela pesquisa que fiz este defeito é característico do moto g então entrei em contato com a Motorola para resolução do meu problema através do chat a mesma passou alguns procedimentos que não deram certo e então solicitaram que eu enviasse o celular para a assistência técnica que fica 400km da',
 ' Levei em uma autorizada fora da minha cidade e não conseguiram encontrar o defeito',
 'Comprei um lenovo k6 plus em maio desse ano e o mesmo já apresentou defeito os botões home multitarefa e voltar pararam de funcionar e para piorar não tem uma assistência técnica da motorola em Joinville SC é muito chato comprar um celular que com 5 meses apresenta defeito totalmente decepcionado com a marca']