## **Natural Language Processing Project (NLP)**

**Business Problem**

* This analysis will aim to build a model that can rate the sentiment of a Tweet based on its content.

**Objectives**

* To build a multiclass classifier that will accurately classify tweets as positive, negative or neutral.



In [1]:
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from collections import  Counter
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import time

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import is_classifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

* **Loading the dataset**

In [3]:
data = pd.read_csv('data.csv', encoding='ISO-8859-1')

In [4]:
#Checking on the tail and head
data

* **Sampling the dataset to have a random glimpse of the tweets**

In [5]:
data.sample(10)

#### **DATA UNDERSTANDING**

* The function below reports the dataset's shape, the DataFrame info, the value counts of the target column, the null values per column, the available columns and a statistical description of the corpus.

In [6]:
def data_understanding(tweets, target_col='is_there_an_emotion_directed_at_a_brand_or_product'):
    """Print a quick structural overview of a tweets DataFrame.

    Reports, in order: the shape, ``DataFrame.info()``, the value counts of
    the target column, the number of nulls per column, the column index and
    ``DataFrame.describe()``. Everything is printed/displayed; nothing is
    returned.

    Parameters
    ----------
    tweets : pandas.DataFrame
        The frame to summarise. Only this argument is inspected, so the
        function works on any frame rather than on the notebook-level ``data``.
    target_col : str, optional
        Column whose class distribution is printed. Defaults to the raw
        label column name used by this notebook's CSV.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``tweets``.
    """
    separator = '*' * 69

    # Shape of the dataset
    display(f"The shape of the dataset is: {tweets.shape}")
    print(separator, '\n')

    # Basic info about the dataframe (info() prints directly to stdout)
    tweets.info()
    print(separator, '\n')

    # Value count in the target column
    emotion_count = tweets[target_col].value_counts()
    print(emotion_count, '\n')
    print(separator)

    # Check for nulls
    print("Null Values", '\n')
    display(tweets.isnull().sum())
    print(separator, '\n')

    # Columns present
    print("Available columns", '\n')
    display(tweets.columns)

    # Statistical description
    display(tweets.describe())

data_understanding(data)

### **Data preprocessing and cleaning**

* This step entails downloading the stopwords, punkt and wordnet resources from NLTK.


#### Data Cleaning

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
nltk.download('wordnet') 

In [9]:
nltk.download('omw-1.4') #Open Multilingual Wordnet (OMW)

In [10]:
# Download NLTK data for stopwords and punkt (tokenization)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

* The function below converts the text to lowercase, removes HTML tags, URLs, @mentions and every non-letter character, tokenizes the text, removes stopwords, lemmatizes the remaining tokens and joins them back into a single string.

In [11]:
# Load NLTK stopwords
stop_words = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps, in order: lowercase, strip HTML markup, drop URLs, drop @mentions,
    keep only ASCII letters and whitespace, tokenize, remove stopwords and
    lemmatize what is left.

    Missing values (anything ``pd.isna`` reports as null) are returned
    unchanged so the caller can decide how to handle them.
    """
    # Guard clause: pass nulls straight through.
    if pd.isna(text):
        return text

    lowered = text.lower()

    # Let BeautifulSoup discard any HTML tags and keep only the text content.
    plain = BeautifulSoup(lowered, 'html.parser').get_text()

    # Drop URLs first, then @usernames.
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', plain, flags=re.MULTILINE)
    without_mentions = re.sub(r'@\w+', '', without_urls)

    # Keep letters and whitespace only; digits and punctuation are removed too.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_mentions)

    # Tokenize, then re-join/split so whitespace is collapsed exactly as before.
    candidate_words = ' '.join(word_tokenize(letters_only)).split()

    # Filter stopwords before lemmatizing the survivors.
    lemmas = [
        lemmatizer.lemmatize(candidate)
        for candidate in candidate_words
        if candidate not in stop_words
    ]
    return ' '.join(lemmas)

# Apply the preprocessing function to the 'tweet_text' column in your DataFrame 'data'
data['tweet_text'] = data['tweet_text'].apply(preprocess_text)


In [12]:
data

In [13]:
# Rename columns
data.rename(columns={'tweet_text': 'tweets', 'emotion_in_tweet_is_directed_at': 'brand', 'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion'}, inplace=True)
data

In [14]:
#Checking on the null values in the columns
data[['brand','tweets','emotion']].isna().sum()

In [15]:
# Checking on duplicates

data['tweets'].duplicated().sum()

In [16]:
# Handling the null values by mapping the tweets with brand names to its brand in the brand column

# Brand keywords, checked in this order; the first one found in a tweet wins.
keywords = ['google', 'apple', 'ipad', 'android', 'iphone','samsung','sony']

def find_brand(text):
    """Return the first entry of ``keywords`` contained in ``text``.

    Matching is a case-insensitive substring test. ``None`` is returned when
    ``text`` is not a string or when no keyword occurs in it.
    """
    if not isinstance(text, str):
        return None

    haystack = text.lower()
    # next() with a default yields the first hit in list order, or None.
    return next((brand for brand in keywords if brand in haystack), None)

data['brand'] = data.apply(lambda row: row['brand'] if not pd.isna(row['brand']) else find_brand(row['tweets']), axis=1)

In [17]:
data

In [18]:
#Rechecking the null values
data['brand'].isna().sum()

In [19]:
#Removing the null values
data.dropna(inplace=True)

In [20]:
#Rechecking for the null values
data['brand'].isna().sum()

In [21]:
data["tweets"].duplicated().sum()

In [22]:
# Dropping the duplicated tweets
data.drop_duplicates(subset=['tweets'], inplace=True)

In [23]:
# Rechecking the tweets for duplicates
duplicated_tweets= data['tweets'].duplicated().sum()
duplicated_tweets

In [24]:
data.sample(5)

In [25]:
# Replace 'sxsw' with a space
data['tweets'] = data['tweets'].str.replace('sxsw', ' ', case=False)

# Display the updated DataFrame
data.sample(5)

#### EDA

In [26]:
import matplotlib.pyplot as plt

# Assuming 'data' is your DataFrame with a column named 'emotion'
sentiment_counts = data['emotion'].value_counts()

# Data
categories = sentiment_counts.index
sentiment_values = sentiment_counts.values

# Plotting
plt.figure(figsize=(10, 7))
plt.bar(categories, sentiment_values, color=['lightgreen', 'salmon', 'lightgray'])

# Title & Labels
plt.title('Distribution of Sentiments', fontweight='bold')
plt.xlabel('Sentiment Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')

plt.show()


In [27]:
from wordcloud import WordCloud
from textwrap import wrap

normal_words =' '.join([text for text in data['tweets']])
                        
# Generate the word cloud with the specified font path
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(normal_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [28]:
#Checking on unique products

data[['tweets', 'brand', 'emotion']] = data[['tweets', 'brand', 'emotion']].apply(lambda x: x.str.lower())


In [29]:
data.brand.unique()

In [30]:
# replacing other names

data['brand'] = data['brand'].str.replace('ipad or iphone app', 'iphone')
data['brand'] = data['brand'].str.replace('other google product or service', 'google')
data['brand'] = data['brand'].str.replace('other apple product or service', 'iphone')
data['brand'] = data['brand'].str.replace('apple', 'iphone')
data['brand'] = data['brand'].str.replace('ipad', 'iphone')
# data['brand'] = data['brand'].str.replace([['android','android app']], 'other brands')
# data['brand'] = data['brand'].str.replace('android app','other brands')

In [31]:
data.brand.unique()

In [32]:
# Grouping the dataframe by product and the emotions associated with each individual brand/company

gb = data.groupby('brand')['emotion'].value_counts()

# Creating the barplot

gb.unstack(level=1).plot(kind='bar', 
                         figsize = (10,7), 
                         fontsize = 13, 
                         rot = 75,
                         ylabel = 'Frequency',
                         xlabel = 'brand/Company',
                         title = 'Tweet Emotion Frequency by Product',
                         colormap = 'Set2'
                        )
plt.show()

In [33]:
#Brand mentions in the tweets

# Relative frequency of each brand, computed once and reused for both axes.
brand_share = data['brand'].value_counts(normalize = True)

# Only the three main brands are plotted. The previous `order` also listed an
# empty string, which is not a value of `brand` and only reserved a blank bar.
c_freq = sns.barplot(x = brand_share.index,
                     y = brand_share.values,
                     order = ['iphone', 'google', 'android'],
                     palette = 'Set2'
                    )
c_freq.set(xlabel = 'Company',
           ylabel = 'Relative Frequency',
           title = 'Company Mention in Tweet Frequency'
          )
plt.show()

In [34]:
#pip install spacy

In [35]:
#!python -m spacy download en_core_web_sm


In [36]:
# Checking frequency of tokens from the tweets

from collections import Counter

# Assuming 'tweet_text' column has been preprocessed
preprocessed_tokens = ' '.join(data['tweets']).split()

# Count the frequency of each token
token_frequency = Counter(preprocessed_tokens)

# Display the most common tokens and their frequencies
token_frequency.most_common(50)


In [37]:
# Plotting top 20 most common tokens

top_20_tokens= token_frequency.most_common(20)

# Extract tokens and frequencies
tokens, frequencies = zip(*top_20_tokens)

# Plotting the horizontal bar graph
plt.figure(figsize=(10, 8))
plt.barh(tokens, frequencies, color='skyblue')
plt.xlabel('Frequency')
plt.title('Top 20 Most Common Tokens')
plt.gca().invert_yaxis()  # To have the highest frequency at the top
plt.show()

In [38]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

#defining a function for wordcloud generation
def generate_wordcloud(tweets, collocations=False, background_color='black',
                       colormap='Greens', display=True):
    """Build (and optionally draw) a word cloud from the supplied tokens.

    The cloud is built from the ``tweets`` argument itself, not from any
    notebook-level variable.

    :param tweets: one of
        * a ``dict``/``Counter`` mapping token -> frequency,
        * an iterable of ``(token, frequency)`` pairs, e.g. the result of
          ``Counter.most_common()``,
        * an iterable of token strings,
        * a single string of raw text.
    :param collocations: forwarded to ``WordCloud``; it only matters when the
        cloud is generated from text, not from explicit frequencies.
    :param background_color: forwarded to ``WordCloud``.
    :param colormap: forwarded to ``WordCloud``.
    :param display: when true, draw the cloud on a new matplotlib figure.
    :return: the generated ``WordCloud`` object.
    """
    ## Initialize a WordCloud
    wordcloud = WordCloud(collocations=collocations,
                          background_color=background_color,
                          colormap=colormap,
                          width=500, height=300)

    ## Generate the cloud from whichever shape of input was supplied
    if isinstance(tweets, str):
        wordcloud.generate(tweets)
    elif isinstance(tweets, dict):
        # Counter is a dict subclass, so token counts can be passed directly.
        wordcloud.generate_from_frequencies(dict(tweets))
    else:
        items = list(tweets)
        is_pairs = bool(items) and all(
            isinstance(item, tuple) and len(item) == 2 for item in items
        )
        if is_pairs:
            # (token, count) pairs: use the counts as the word weights.
            wordcloud.generate_from_frequencies(dict(items))
        else:
            wordcloud.generate(','.join(items))

    ## Plot with matplotlib
    if display:
        plt.figure(figsize = (12, 15), facecolor = None)
        plt.imshow(wordcloud)
        plt.axis('off')
    return wordcloud

In [39]:
#generating word cloud
cloud_pos_w_company = generate_wordcloud(token_frequency.most_common(), collocations=True)

In [40]:
data

In [41]:
import nltk
nltk.download('averaged_perceptron_tagger')

In [42]:
# POS tagging
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def tweets_tagging(tweets):
    """Tokenize one tweet string and return its list of (token, POS tag) pairs."""
    return pos_tag(word_tokenize(tweets))

#Applying the function in tweets
data["pos_tags"]=data["tweets"].apply(tweets_tagging)

print(data[["tweets","pos_tags"]])

In [43]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Create a list to store named entities for each tweet
named_entities_list = []

# Process each tweet individually
for tweet in data["tweets"]:
    doc = nlp(tweet)

    # Extract named entities for each tweet
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]
    named_entities_list.append(named_entities)

# Print or use the list as needed
print(named_entities_list)


**Polarity score using Vader**

In [None]:
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

sia= SentimentIntensityAnalyzer()

In [None]:

text = data["tweets"]
res = []

for i, row in tqdm(data.iterrows(), total=len(data)):
    mtweet = row["tweets"]
    score = sia.polarity_scores(mtweet)
    res.append(score)


In [None]:
#Converting into dataframe
vaders=pd.DataFrame(res)
vaders.sample(5)

In [None]:
# Merging the polarities with the tweet metadata.
#
# `vaders` is built from a plain list, so its index is a fresh 0..n-1 range,
# whereas `data` still carries its original row labels with gaps left by
# dropna() and drop_duplicates(). Joining on those labels would attach scores
# to the wrong tweets (or to none at all), so both frames are re-indexed
# positionally first. Row i of `vaders` was computed from row i of `data`.
merged_df = vaders.reset_index(drop=True).merge(
    data.reset_index(drop=True),
    left_index=True,
    right_index=True,
    how="left",
)


In [None]:
merged_df.sample(5)

In [None]:
import seaborn as sns
ax=sns.barplot(data=vaders, x =merged_df['brand'], y='compound')
ax.set_title("polarity scores plot")
plt.show()

In [None]:
#Positive sentiment polarities
import seaborn as sns
ax=sns.barplot(data=vaders, x =merged_df['brand'], y='pos')
ax.set_title("polarity scores plot")
plt.show()

In [None]:
#Negative sentiment polarities
import seaborn as sns
ax=sns.barplot(data=vaders, x =merged_df['brand'], y='neg')
ax.set_title("polarity scores plot")
plt.show()

In [None]:
#Neutral sentiment polarities
import seaborn as sns
ax=sns.barplot(data=vaders, x =merged_df['brand'], y='neu')
ax.set_title("polarity scores plot")
plt.show()

#### FEATURE ENGINEERING

#### TF-IDF (Term Frequency-Inverse Document Frequency):

Convert text data into numerical vectors, emphasizing words that are unique to particular documents.
##### Word2Vec:

Generate dense vector representations of words, capturing semantic meanings based on contextual usage.
##### Doc2Vec:

Generate vector representations for entire documents or sentences, capturing the contextual and semantic information where understanding the semantic context of entire texts is crucial.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# vectorization
vectorizer            =  TfidfVectorizer()
data_vec =  vectorizer.fit_transform(data['tweets']).toarray()

In [None]:
data_vec

In [None]:
data_vec_df = pd.DataFrame(data_vec)

####  Word2Vec

In [None]:
import random
import numpy as np
from gensim.models import Word2Vec

# Set a random seed for reproducibility
seed = 42
np.random.seed(seed)
random.seed(seed)

# Word2Vec expects every sentence to be a list of tokens. Passing the raw
# strings makes it iterate over characters and learn character vectors, so the
# tweets are split into words first.
tokenized_tweets = data['tweets'].str.split().tolist()

# Train Word2Vec model with a fixed seed.
# workers=1: multi-threaded training is not deterministic even with a seed.
word2vec_model = Word2Vec(sentences=tokenized_tweets, vector_size=100, window=5, min_count=1, workers=1, seed=seed)

def document_vector(word2vec_model, doc):
    """Average the word vectors of a document.

    :param word2vec_model: trained gensim ``Word2Vec`` model.
    :param doc: either a list of tokens or a whitespace-separated string
        (strings are split into tokens first, so they are never iterated
        character by character).
    :return: 1-D numpy array of length ``word2vec_model.vector_size``; all
        zeros when none of the tokens is in the model vocabulary.
    """
    if isinstance(doc, str):
        doc = doc.split()

    # Remove out-of-vocabulary words
    known_words = [word for word in doc if word in word2vec_model.wv.key_to_index]

    # If no words of the doc are in the word2vec vocab, return a vector of zeros
    if not known_words:
        return np.zeros(word2vec_model.vector_size)

    # Otherwise, get the mean of the vectors
    return np.mean(word2vec_model.wv[known_words], axis=0)

# Apply function to calculate vectors for all documents
word2vec_vectors = data['tweets'].apply(lambda x: document_vector(word2vec_model, x))
word2vec_df = pd.DataFrame(word2vec_vectors.tolist())

In [None]:
word2vec_df

### Modelling

In [None]:
!pip install --upgrade scikit-learn


In [None]:
!pip install imbalanced-learn==0.8.0


In [None]:
# Define a function for modeling
def train_and_evaluate(model, X, y, test_size=0.2):
    """
    Train a classifier behind SMOTE oversampling and report train/test metrics.

    The data passed in is split once (``random_state=42``), a SMOTE + model
    pipeline is fitted on the training split, and accuracy, weighted F1 and
    weighted one-vs-rest ROC-AUC are printed for both splits.

    :param model: unfitted scikit-learn style classifier
    :param X: Features, one row per sample
    :param y: Target variable, positionally aligned with ``X``
    :param test_size: Fraction of dataset to be used as test set
    :return: tuple ``(pipeline, metrics_df)``: the fitted pipeline and a
        one-row DataFrame holding the metrics and the training time.
        ROC-AUC entries are NaN when the model exposes no ``predict_proba``.
    :raises ValueError: if ``model`` is not a classifier
    """
    # Check if the model is a classifier before doing any work
    if not is_classifier(model):
        raise ValueError("Model is not a classifier")

    # Split the data that was passed in, honouring the requested test_size
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # SMOTE is a sampler (fit_resample), which scikit-learn's Pipeline rejects
    # as an intermediate step; imbalanced-learn's Pipeline supports it.
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    # Measure the training time
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    training_time = time.time() - start_time
    print("Training time:", training_time, "seconds")
    print("\n")

    # Predictions
    train_preds = pipeline.predict(X_train)
    test_preds = pipeline.predict(X_test)

    # Evaluation
    print("Model Performance")
    print("------------------")

    # Train Metrics
    train_accuracy = accuracy_score(y_train, train_preds)
    train_f1 = f1_score(y_train, train_preds, average='weighted')
    train_roc_auc = _weighted_ovr_roc_auc(pipeline, X_train, y_train)

    print("Train Accuracy: ", train_accuracy)
    print("Train F1 Score: ", train_f1)
    print("Train ROC-AUC Score: ", train_roc_auc)
    print("\nClassification Report (Train Data):\n", classification_report(y_train, train_preds))

    # Test Metrics
    test_accuracy = accuracy_score(y_test, test_preds)
    test_f1 = f1_score(y_test, test_preds, average='weighted')
    test_roc_auc = _weighted_ovr_roc_auc(pipeline, X_test, y_test)

    print("\nTest Accuracy: ", test_accuracy)
    print("Test F1 Score: ", test_f1)
    print("Test ROC-AUC Score: ", test_roc_auc)
    print("\nClassification Report (Test Data):\n", classification_report(y_test, test_preds))

    # Create a DataFrame to store the metrics
    metrics_df = pd.DataFrame({
        'Model': [str(model)],
        'Train_Accuracy': [train_accuracy],
        'Train_F1': [train_f1],
        'Train_ROC_AUC': [train_roc_auc],
        'Test_Accuracy': [test_accuracy],
        'Test_F1': [test_f1],
        'Test_ROC_AUC': [test_roc_auc],
        'Training_Time': [training_time]
    })

    return pipeline, metrics_df


def _weighted_ovr_roc_auc(pipeline, X_part, y_part):
    """Weighted one-vs-rest ROC-AUC of a fitted pipeline on one data split.

    Returns NaN when the pipeline has no ``predict_proba`` (for example an
    SVC created without ``probability=True``), so evaluation can continue.
    """
    if not hasattr(pipeline, 'predict_proba'):
        return float('nan')

    proba = pipeline.predict_proba(X_part)

    # Binary problems: roc_auc_score wants the positive-class column only.
    if proba.shape[1] == 2:
        return roc_auc_score(y_part, proba[:, 1])

    return roc_auc_score(
        y_part,
        proba,
        multi_class='ovr',
        average='weighted',
        labels=pipeline.classes_,
    )

#### SUPPORT VECTOR MACHINE

In [None]:



# Instantiate and train SVC
# probability=True is needed for SVC to expose predict_proba, which the
# evaluation function calls to compute ROC-AUC; random_state keeps the
# internal probability calibration repeatable.
model1 = SVC(decision_function_shape='ovo', probability=True, random_state=42)

# No `X`/`y` exist at notebook level, so the Word2Vec document vectors and the
# emotion labels are passed explicitly.
trained_svm_model, svm_metrics = train_and_evaluate(model1, word2vec_df, data['emotion'])