In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## import dataset

In [3]:
data = pd.read_csv('../artifacts/ds.csv')

In [4]:
data.head()

## Data Preprocessing

In [5]:
data.shape

#### remove duplicates

In [8]:
data.duplicated().sum()

In [9]:
duplicates = data[data.duplicated()]
print(duplicates)


In [10]:
data = data.drop_duplicates()
print(f"Dataset size after removing duplicates: {data.shape}")


In [14]:
print(f"Remaining duplicates: {data.duplicated().sum()}")


#### checking null values

In [15]:
data.isnull().sum()

### Text Preprocessing

##### convert uppercase to lowercase

In [16]:
import re
import string

In [33]:
data["feedback"].head(5)

In [34]:
# Work on an explicit copy so the column assignment below does not write into a view.
data = data.copy()
# Lower-case the text and collapse every run of whitespace into a single space.
data["feedback"] = data["feedback"].apply(
    lambda text: " ".join(text.lower().split())
)

In [35]:
data["feedback"].head()

##### remove links

In [36]:
# Remove http/https URLs wherever they occur in the text. The previous per-token,
# `^`-anchored pattern only caught URLs at the very start of a whitespace-separated
# token, so something like "(http://example.com)" survived and later turned into a
# junk token once punctuation was stripped. Whitespace is re-normalised afterwards.
data["feedback"] = data["feedback"].apply(
    lambda text: " ".join(re.sub(r'https?://\S+', '', text).split())
)

In [38]:
data["feedback"]

##### remove punctuation

In [39]:
string.punctuation

In [40]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted.

    Characters outside ``string.punctuation`` (including non-ASCII punctuation)
    are left untouched.
    """
    # One translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

data["feedback"] = data["feedback"].apply(remove_punctuation)

In [41]:
data["feedback"].head(5)

##### remove numbers

In [55]:
data["feedback"].iloc[90]


In [58]:
# Delete every run of one or more digits from the feedback text (regex replace).
data["feedback"] = data["feedback"].str.replace(r'\d+', '', regex=True)

In [59]:
data["feedback"].iloc[90]

##### remove stopwords

In [60]:
!pip install nltk

In [61]:
import nltk

##### download stopwords for the English language

In [62]:
nltk.download('stopwords',download_dir='../static/model')

In [63]:
# Read the English stopword file from the download directory used above;
# each line of the file becomes one entry of the list `sw`.
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw=file.read().splitlines()

In [64]:
sw

In [65]:
# Use a set for O(1) membership checks instead of scanning the `sw` list for every word.
# NOTE(review): punctuation was stripped from the feedback earlier, so any stopword in
# `sw` that contains an apostrophe (e.g. contractions) can no longer match here —
# confirm whether that is acceptable.
stopword_set = set(sw)
data["feedback"] = data["feedback"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stopword_set)
)

In [67]:
data["feedback"].head(5)

##### stemming

In [68]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [69]:
# Replace every whitespace-separated word with its Porter stem.
data["feedback"] = data["feedback"].apply(
    lambda text: " ".join(ps.stem(word) for word in text.split())
)

In [72]:
data

## Vectorization

##### Building Vocabulary

In [83]:
from collections import Counter
vocab=Counter()

In [84]:
vocab

In [88]:
# Rebuild the counter from scratch so that re-running this cell does not add every
# token's count a second time (which would change which tokens pass the frequency
# cut-off applied below).
vocab = Counter(word for sentence in data["feedback"] for word in sentence.split())

In [90]:
len(vocab)

In [91]:
data.shape

In [120]:
vocab

##### creating vocabulary

In [118]:
# Keep only the tokens that occur more than 20 times across all feedback.
tokens = [word for word, count in vocab.items() if count > 20]

In [119]:
len(tokens)

##### save vocabulary

In [121]:
def save_vocubulary(lines, filename):
    """Write the vocabulary tokens to ``filename``, one token per line.

    Args:
        lines: Iterable of token strings to persist.
        filename: Destination path; the file is created or overwritten.

    The tokens are joined with a newline (no trailing newline) and written as UTF-8.
    """
    # The context manager closes the handle even if the write raises, which the
    # manual open()/close() pair did not guarantee.
    with open(filename, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write('\n'.join(lines))

save_vocubulary(tokens,'../static/model/vocabulary.txt')

### Divide Dataset

In [126]:
X=data["feedback"]
Y=data["sentiment (binary)"]
Z=data["category (binary)"]

In [127]:
X

In [128]:
Y

In [129]:
Z

In [130]:
!pip install scikit-learn

In [132]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; features and both label columns are split with the same
# row assignment. A fixed seed makes the split, and every score reported below,
# reproducible across kernel restarts.
# NOTE(review): the vocabulary above was built from the full dataset before this
# split, so test-set words influence which features exist — confirm this is intended.
X_train, X_test, Y_train, Y_test, Z_train, Z_test = train_test_split(
    X, Y, Z, test_size=0.2, random_state=42
)

In [139]:
X_train.shape

In [136]:
len(X_test)

In [137]:
len(Y_train)

In [138]:
len(Y_test)

In [140]:
len(Z_train)

In [141]:
len(Z_test)

### Vectorization

In [142]:
def vectorizer(ds,vocabulary):
    """Encode each sentence as a binary bag-of-words vector over ``vocabulary``.

    Args:
        ds: Iterable of sentences (strings); each is tokenised with ``str.split()``.
        vocabulary: Sequence of tokens; position ``i`` of every output row
            corresponds to ``vocabulary[i]``.

    Returns:
        A ``float32`` NumPy array with one row per sentence, holding 1 where the
        vocabulary token appears as a whole word in the sentence and 0 otherwise.
    """
    vectorized_list = []

    for sentence in ds:
        # Tokenise once per sentence; the previous code re-split the sentence for
        # every vocabulary entry. Whole-word membership semantics are unchanged.
        words = set(sentence.split())
        sentence_list = np.zeros(len(vocabulary))
        for i, token in enumerate(vocabulary):
            if token in words:
                sentence_list[i] = 1
        vectorized_list.append(sentence_list)

    return np.asarray(vectorized_list, dtype=np.float32)
    

##### vectorize the training feedback

In [150]:
vectorized_X_train=vectorizer(X_train,tokens)

In [151]:
for i in vectorized_X_train[0]:
    print(i)

In [152]:
vectorized_X_train[0]

##### vectorize the test feedbacks

In [153]:
vectorized_X_test=vectorizer(X_test,tokens)

In [154]:
vectorized_X_test

In [155]:
vectorized_X_train

In [157]:
Y_train

In [158]:
Y_train.value_counts()

In [160]:
# Pie chart of the training-split sentiment label counts, in label order 0, 1, 10.
plt.pie(
    Y_train.value_counts().loc[[0, 1, 10]].to_numpy(),
    labels=['positive', 'negative', 'neutral'],
)
plt.show()

In [161]:
Z_train.value_counts()

In [163]:
# Pie chart of the training-split category label counts, in label order 0, 1, 10.
plt.pie(
    Z_train.value_counts().loc[[0, 1, 10]].to_numpy(),
    labels=['claim', 'service', 'policy'],
)
plt.show()

### Handle imbalanced dataset

In [164]:
!pip install imbalanced-learn

In [173]:
from imblearn.over_sampling import SMOTE
# Oversample the sentiment classes of the training split. SMOTE draws random
# synthetic samples, so the seed is fixed to keep the resampled set (and the
# training scores computed on it) reproducible.
smote=SMOTE(random_state=42)
vectorized_X_train_smote,Y_train_smote=smote.fit_resample(vectorized_X_train,Y_train)
print(vectorized_X_train_smote.shape,Y_train_smote.shape)

In [174]:
Y_train_smote.value_counts()

In [175]:
from imblearn.over_sampling import SMOTE
# Oversample the category classes of the training split. SMOTE draws random
# synthetic samples, so the seed is fixed to keep the resampled set (and the
# training scores computed on it) reproducible.
smote=SMOTE(random_state=42)
vectorized_X_train_smote_category,Z_train_smote=smote.fit_resample(vectorized_X_train,Z_train)
print(vectorized_X_train_smote_category.shape,Z_train_smote.shape)

In [177]:
Z_train_smote.value_counts()

In [178]:
# Pie chart of the sentiment label counts after SMOTE, in label order 0, 1, 10.
plt.pie(
    Y_train_smote.value_counts().loc[[0, 1, 10]].to_numpy(),
    labels=['positive', 'negative', 'neutral'],
)
plt.show()

In [179]:
# Pie chart of the category label counts after SMOTE, in label order 0, 1, 10.
plt.pie(
    Z_train_smote.value_counts().loc[[0, 1, 10]].to_numpy(),
    labels=['claim', 'service', 'policy'],
)
plt.show()

##### use this dataset to train & test the sentiment models

In [184]:
vectorized_X_train_smote

In [185]:
Y_train_smote

In [186]:
vectorized_X_test

In [187]:
Y_test

##### use this dataset to train & test the categorization models

In [188]:
vectorized_X_train_smote_category

In [189]:
Z_train_smote

In [190]:
vectorized_X_test

In [192]:
Z_test

## Model Training & Evaluation 

In [194]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

### Model Training & Evaluation for Sentiment

In [201]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _print_scores(label, y_act, y_pred):
    """Print accuracy and weighted precision/recall/F1 under a ``<label> Scores:`` heading.

    All four metrics are rounded to three decimals; precision, recall and F1 use
    ``average='weighted'``.
    """
    acc = round(accuracy_score(y_act, y_pred), 3)
    prec = round(precision_score(y_act, y_pred, average='weighted'), 3)
    rec = round(recall_score(y_act, y_pred, average='weighted'), 3)
    f1 = round(f1_score(y_act, y_pred, average='weighted'), 3)

    print(f'{label} Scores:\n\tAccuracy: {acc}\n\tPrecision: {prec}\n\tRecall: {rec}\n\tF1-Score: {f1}\n')

def training_scores(y_act, y_pred):
    """Print the metrics for predictions made on the training data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    _print_scores('Training', y_act, y_pred)

def validation_scores(y_act, y_pred):
    """Print the metrics for predictions made on the held-out test data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    _print_scores('Testing', y_act, y_pred)


##### logistic regression

In [202]:
# Fit a logistic-regression classifier (default hyper-parameters) on the
# SMOTE-resampled training vectors and sentiment labels.
lr=LogisticRegression()
lr.fit(vectorized_X_train_smote,Y_train_smote)

In [203]:
Y_train_predict=lr.predict(vectorized_X_train_smote)

In [204]:
Y_train_predict

In [205]:
Y_train_smote

In [206]:
training_scores(Y_train_smote,Y_train_predict)

In [207]:
Y_test_predict=lr.predict(vectorized_X_test)

In [234]:
Y_test_predict

In [235]:
Y_test

In [236]:
validation_scores(Y_test, Y_test_predict)

##### Naive Bayes

In [237]:
# Multinomial Naive Bayes for sentiment: fit on the resampled training set, then
# report metrics on that set and on the untouched test split.
mnb = MultinomialNB().fit(vectorized_X_train_smote, Y_train_smote)
Y_train_predict = mnb.predict(vectorized_X_train_smote)
Y_test_predict = mnb.predict(vectorized_X_test)
training_scores(Y_train_smote, Y_train_predict)
validation_scores(Y_test, Y_test_predict)

##### Decision Tree

In [238]:
# Decision tree for sentiment. The seed is fixed because tree construction breaks
# ties randomly, which otherwise makes the reported scores vary between runs.
dt=DecisionTreeClassifier(random_state=42)
dt.fit(vectorized_X_train_smote,Y_train_smote)
Y_train_predict=dt.predict(vectorized_X_train_smote)
Y_test_predict=dt.predict(vectorized_X_test)
training_scores(Y_train_smote,Y_train_predict)
validation_scores(Y_test, Y_test_predict)

##### Random forest

In [239]:
# Random forest for sentiment. The seed is fixed because bootstrap sampling and
# feature sub-sampling are random, which otherwise makes the scores vary between runs.
rf=RandomForestClassifier(random_state=42)
rf.fit(vectorized_X_train_smote,Y_train_smote)
Y_train_predict=rf.predict(vectorized_X_train_smote)
Y_test_predict=rf.predict(vectorized_X_test)
training_scores(Y_train_smote,Y_train_predict)
validation_scores(Y_test, Y_test_predict)

##### Support vector machine

In [240]:
# Support vector classifier for sentiment: fit on the resampled training set, then
# report metrics on that set and on the untouched test split.
svm = SVC().fit(vectorized_X_train_smote, Y_train_smote)
Y_train_predict = svm.predict(vectorized_X_train_smote)
Y_test_predict = svm.predict(vectorized_X_test)
training_scores(Y_train_smote, Y_train_predict)
validation_scores(Y_test, Y_test_predict)

### saving the model

In [241]:
import pickle
# Serialize the fitted Multinomial Naive Bayes sentiment classifier (`mnb`) to disk.
with open ('../static/model/sentiment_model.pickle','wb') as file:
    pickle.dump(mnb,file)

### Model Training & Evaluation for Categorization

##### logistic regression

In [242]:
# Fit a logistic-regression classifier (default hyper-parameters) on the
# SMOTE-resampled training vectors and category labels.
lrc=LogisticRegression()
lrc.fit(vectorized_X_train_smote_category,Z_train_smote)

In [243]:
Z_train_predict=lrc.predict(vectorized_X_train_smote_category)

In [246]:
Z_train_predict

In [247]:
Z_train_smote

In [248]:
training_scores(Z_train_smote,Z_train_predict)

In [252]:
Z_test_predict=lrc.predict(vectorized_X_test)

In [255]:
Z_test_predict

In [256]:
Z_test

In [257]:
 validation_scores(Z_test, Z_test_predict)

##### Naive Bayes

In [258]:
# Multinomial Naive Bayes for categorization: fit on the resampled training set,
# then report metrics on that set and on the untouched test split.
mnbc = MultinomialNB().fit(vectorized_X_train_smote_category, Z_train_smote)
Z_train_predict = mnbc.predict(vectorized_X_train_smote_category)
Z_test_predict = mnbc.predict(vectorized_X_test)
training_scores(Z_train_smote, Z_train_predict)
validation_scores(Z_test, Z_test_predict)

##### Decision Tree

In [259]:
# Decision tree for categorization. The seed is fixed because tree construction breaks
# ties randomly, which otherwise makes the reported scores vary between runs.
dtc=DecisionTreeClassifier(random_state=42)
dtc.fit(vectorized_X_train_smote_category,Z_train_smote)
Z_train_predict=dtc.predict(vectorized_X_train_smote_category)
Z_test_predict=dtc.predict(vectorized_X_test)
training_scores(Z_train_smote,Z_train_predict)
validation_scores(Z_test, Z_test_predict)

##### Random Forest

In [260]:
# Random forest for categorization. The seed is fixed because bootstrap sampling and
# feature sub-sampling are random, which otherwise makes the scores vary between runs.
rfc=RandomForestClassifier(random_state=42)
rfc.fit(vectorized_X_train_smote_category,Z_train_smote)
Z_train_predict=rfc.predict(vectorized_X_train_smote_category)
Z_test_predict=rfc.predict(vectorized_X_test)
training_scores(Z_train_smote,Z_train_predict)
validation_scores(Z_test, Z_test_predict)

##### Support vector machine

In [261]:
# Support vector classifier for categorization: fit on the resampled training set,
# then report metrics on that set and on the untouched test split.
svmc = SVC().fit(vectorized_X_train_smote_category, Z_train_smote)
Z_train_predict = svmc.predict(vectorized_X_train_smote_category)
Z_test_predict = svmc.predict(vectorized_X_test)
training_scores(Z_train_smote, Z_train_predict)
validation_scores(Z_test, Z_test_predict)

### Saving the model

In [262]:
import pickle
# Serialize the fitted Multinomial Naive Bayes categorization classifier (`mnbc`) to disk.
with open ('../static/model/categorization_model.pickle','wb') as file:
    pickle.dump(mnbc,file)