In [97]:
#Text preprocessing. Cleaning/ preparing data.
#Tokenization. Dividing sentences into words.
#Stopword removal. Removing irrelevant words.
#Stemming & Lemmatization. Reducing words to base form; Grammar
#Named Entry recognition. Extract names , places . dates
#Text Vectorisation. # Convert text to Numbers.
# modelling. Create your model.

In [98]:
# Import necessary libraries.
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import PorterStemmer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [99]:
data = pd.read_csv('Twitter_Data.csv')

In [100]:
display(data.head())

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [101]:
#Step 1; Text Preprocessing.
#Drop missing rows.
data.dropna(inplace=True)

# Convert texts to lowercase.
type(data['clean_text'])
data['clean_text']= data['clean_text'].astype(str)
data['clean_text'] = data['clean_text'].str.lower()

#Remove symbols.
def clean_text(text):
   text = re.sub(r'http\S+|www.\S+', '', text) #Remove Urls.
   text = re.sub(r'@\w+|#\w+', '', text) #Remove mentions and hashtags.
   text = re.sub(r'\d+', '', text) #Remove digits.
   text = re.sub(r'[^\w\s]', '', text) #Remove punctuation.
   return text

data['clean_text'] = data['clean_text'].apply(clean_text)
data.head()

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [102]:
# Step 2; Tokenization
#Download tokenizer model
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# Create to process clean_text
class TextProcessing:
  def __init__(self,column):
    self.column = column

  def tokenizeText(self, text):
    return word_tokenize(str(text))

  def apply(self, df):
    df['tokens'] = df[self.column].map(self.tokenizeText)
    return df

# Apply tokenization on column 'clean_text'
processor = TextProcessing(column='clean_text')
data = processor.apply(data)

data.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/faithkamande/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,clean_text,category,tokens
0,when modi promised minimum government maximum ...,-1.0,"[when, modi, promised, minimum, government, ma..."
1,talk all the nonsense and continue all the dra...,0.0,"[talk, all, the, nonsense, and, continue, all,..."
2,what did just say vote for modi welcome bjp t...,1.0,"[what, did, just, say, vote, for, modi, welcom..."
3,asking his supporters prefix chowkidar their n...,1.0,"[asking, his, supporters, prefix, chowkidar, t..."
4,answer who among these the most powerful world...,1.0,"[answer, who, among, these, the, most, powerfu..."


In [103]:
display(data.head())

Unnamed: 0,clean_text,category,tokens
0,when modi promised minimum government maximum ...,-1.0,"[when, modi, promised, minimum, government, ma..."
1,talk all the nonsense and continue all the dra...,0.0,"[talk, all, the, nonsense, and, continue, all,..."
2,what did just say vote for modi welcome bjp t...,1.0,"[what, did, just, say, vote, for, modi, welcom..."
3,asking his supporters prefix chowkidar their n...,1.0,"[asking, his, supporters, prefix, chowkidar, t..."
4,answer who among these the most powerful world...,1.0,"[answer, who, among, these, the, most, powerfu..."


In [104]:
# Step 3: Removing Stopwords
# Downloading stopwords
nltk.download('stopwords')

#defining english stopwords
english_stopwords = set(stopwords.words('english'))

#Function to remove the stopwords
def remove_stopwords(row_tokens):
    cleaned = []
    for word in row_tokens:
        if word not in english_stopwords:
            cleaned.append(word)
    return cleaned

# applying the function to the data
data['stopword_tokens'] = data['tokens'].apply(remove_stopwords)
data['tokens']

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/faithkamande/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0         [when, modi, promised, minimum, government, ma...
1         [talk, all, the, nonsense, and, continue, all,...
2         [what, did, just, say, vote, for, modi, welcom...
3         [asking, his, supporters, prefix, chowkidar, t...
4         [answer, who, among, these, the, most, powerfu...
                                ...                        
162975    [why, these, crores, paid, neerav, modi, not, ...
162976    [dear, rss, terrorist, payal, gawar, what, abo...
162977    [did, you, cover, her, interaction, forum, whe...
162978    [there, big, project, came, into, india, modi,...
162979    [have, you, ever, listen, about, like, gurukul...
Name: tokens, Length: 162969, dtype: object

In [105]:
data.head()

Unnamed: 0,clean_text,category,tokens,stopword_tokens
0,when modi promised minimum government maximum ...,-1.0,"[when, modi, promised, minimum, government, ma...","[modi, promised, minimum, government, maximum,..."
1,talk all the nonsense and continue all the dra...,0.0,"[talk, all, the, nonsense, and, continue, all,...","[talk, nonsense, continue, drama, vote, modi]"
2,what did just say vote for modi welcome bjp t...,1.0,"[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m..."
3,asking his supporters prefix chowkidar their n...,1.0,"[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,..."
4,answer who among these the most powerful world...,1.0,"[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today..."


In [106]:
#Step 4: Stemming.
from nltk.stem import PorterStemmer

# Defining the stemmer
stemmer = PorterStemmer()

# Function for stemming the text
def stem_tokens(tokens):
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return stemmed_tokens

data['stemmed_tokens'] = data['stopword_tokens'].apply(stem_tokens)

In [107]:
data.head()

Unnamed: 0,clean_text,category,tokens,stopword_tokens,stemmed_tokens
0,when modi promised minimum government maximum ...,-1.0,"[when, modi, promised, minimum, government, ma...","[modi, promised, minimum, government, maximum,...","[modi, promis, minimum, govern, maximum, gover..."
1,talk all the nonsense and continue all the dra...,0.0,"[talk, all, the, nonsense, and, continue, all,...","[talk, nonsense, continue, drama, vote, modi]","[talk, nonsens, continu, drama, vote, modi]"
2,what did just say vote for modi welcome bjp t...,1.0,"[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m...","[say, vote, modi, welcom, bjp, told, rahul, ma..."
3,asking his supporters prefix chowkidar their n...,1.0,"[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,...","[ask, support, prefix, chowkidar, name, modi, ..."
4,answer who among these the most powerful world...,1.0,"[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today...","[answer, among, power, world, leader, today, t..."


In [108]:
#Step 5: Lemmatizing texts
nltk.download('wordnet')

# Defining the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function forlemmatizing the text
def lemmatize_tokens(tokens):
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

data['lemmatized_tokens'] = data['stopword_tokens'].apply(lemmatize_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/faithkamande/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [109]:
data.head()

Unnamed: 0,clean_text,category,tokens,stopword_tokens,stemmed_tokens,lemmatized_tokens
0,when modi promised minimum government maximum ...,-1.0,"[when, modi, promised, minimum, government, ma...","[modi, promised, minimum, government, maximum,...","[modi, promis, minimum, govern, maximum, gover...","[modi, promised, minimum, government, maximum,..."
1,talk all the nonsense and continue all the dra...,0.0,"[talk, all, the, nonsense, and, continue, all,...","[talk, nonsense, continue, drama, vote, modi]","[talk, nonsens, continu, drama, vote, modi]","[talk, nonsense, continue, drama, vote, modi]"
2,what did just say vote for modi welcome bjp t...,1.0,"[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m...","[say, vote, modi, welcom, bjp, told, rahul, ma...","[say, vote, modi, welcome, bjp, told, rahul, m..."
3,asking his supporters prefix chowkidar their n...,1.0,"[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,...","[ask, support, prefix, chowkidar, name, modi, ...","[asking, supporter, prefix, chowkidar, name, m..."
4,answer who among these the most powerful world...,1.0,"[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today...","[answer, among, power, world, leader, today, t...","[answer, among, powerful, world, leader, today..."


In [110]:
# Step 6; Vectorising.

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['lemmatized_tokens'].apply(lambda x: ' '.join(x)))

print(vectorizer.get_feature_names_out())

print(X.toarray())

['aa' 'aaa' 'aaaa' ... 'แจก' 'ỉnto' '东西南北']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [111]:
data.to_csv("processed_data.csv",index = False)

### MODELLING.


In [112]:
# MODELLING
# We will use four models with four various hyperparameters to get the best model abd the best parameters.
    #1. Logistic_Regression (solver= 'liblinear'/'multiclass.')
    #2. Random Forest ; parameters(Num_estimators= [1,5,10])
    #3. MultinomialNaive Bayes; parameters, {'alpha': [0.1, 0.5, 1.0, 2.0]}
    #4. SVM model; parameters; {c:[1,10,20], 'kernel':['rbf','linear']}



In [113]:
# Define Model Parameters.
model_params= {
   'logistic_regression': {
       'model': LogisticRegression(solver='liblinear',multi_class='auto'),
       'params': {
           'C': [1,5,10]
       }
   },
   'random_forest': {
       'model': RandomForestClassifier(),
       'params': {
           'n_estimators': [1,5,10]
       }
   },
   'naive_bayes': {
       'model': MultinomialNB(),
       'params': {
           'alpha': [0.1, 0.5, 1.0, 2.0]
       }
   },
   'svm': {
       'model': LinearSVC(),
       'params': {
           'C': [1,10,20]
       }
   }
}

scores=[]

In [114]:
# Implementing the models.

# for model_name,mvp in model_params.items():
#   clf= GridSearchCV(mvp['model'],mvp['params'],cv=5,return_train_score=False)
#   clf.fit(X,data['category'])
#   scores.append({
#       'model': model_name,
#       'best_score': clf.best_score_,
#       'best_params': clf.best_params_
#   })

In [115]:
#Using Randomised search CV

#for model_name,mvp in model_params.items():
  #rs= RandomizedSearchCV(mvp['model'],mvp['params'],cv=5,return_train_score=False,n_iter=2)
  #rs.fit(X,data['category'])
  #scores.append({
      #'model': model_name,
      #'best_score': rs.best_score_,
      #'best_params': rs.best_params_
 # })

In [116]:
df_model_evaluation= pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_model_evaluation

Unnamed: 0,model,best_score,best_params


In [117]:
#DATA PREPARSTION.
#Perform Train test split.
X_train, X_test, y_train, y_test = train_test_split(X, data['category'], test_size=0.3, random_state=42)

print('The train data consists of: ', X_train.shape[0], 'rows')
print('The test data consists of: ', X_test.shape[0], ' rows')

The train data consists of:  114078 rows
The test data consists of:  48891  rows


In [118]:
# Define the model.
model = LogisticRegression(solver='liblinear',multi_class='auto', C= 1)

In [119]:
#Fit the model.
model.fit(X_train,y_train)



In [120]:
def predict_sentiment(scenario):
    # Clean and preprocess the input if needed
    processed = vectorizer.transform([scenario])
    prediction = model.predict(processed)
    
    if prediction[0] == 1:
        print('Positive')
    elif prediction[0] == -1:
        print('Negative')
    else:
        print('Neutral')

In [121]:
# Find Model evaluation.
y_pred= model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        -1.0       0.86      0.80      0.83     10691
         0.0       0.88      0.95      0.91     16644
         1.0       0.92      0.89      0.90     21556

    accuracy                           0.89     48891
   macro avg       0.89      0.88      0.88     48891
weighted avg       0.89      0.89      0.89     48891



In [124]:
joblib.dump((vectorizer, model), "Logistic_model.pkl")

['Logistic_model.pkl']