In [153]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [154]:
# Import required libraries
import pandas as pd  # For handling and manipulating data in DataFrame
import numpy as np  # For numerical operations
import re  # Regular expression operations for text cleaning
import spacy  # For NLP tasks like tokenization, stopwords, and lemmatization
from sklearn.model_selection import train_test_split  # To split dataset into train and test sets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # Convert text to numerical format
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Normalize numerical data and encode categorical variables
from xgboost import XGBClassifier  # XGBoost classifier for training the model
from sklearn.metrics import classification_report, accuracy_score  # To evaluate model performance
from scipy.sparse import hstack  # For combining sparse matrices (text features, numerical features, etc.)
import requests  # To make HTTP requests to download dataset from GitHub
from io import StringIO  # For handling in-memory text streams
import unicodedata

In [155]:
# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

In [156]:
# Step 1: Read dataset from GitHub using requests
# Read the CSV file from GitHub using the 'requests' library and convert the response to a pandas DataFrame
# use below code if running on kaggle
# Try reading the CSV file with a different encoding
try:
    data = pd.read_csv('/kaggle/input/tweet-sentiment-dataset/tweet_sentiments_dataset.csv', encoding='utf-8')  # First attempt with utf-8
except UnicodeDecodeError:
    data = pd.read_csv('/kaggle/input/tweet-sentiment-dataset/tweet_sentiments_dataset.csv', encoding='ISO-8859-1')  # Fallback to ISO-8859-1 or latin1

#use below code if running other than kaggle with connection limitation
# url = 'https://github.com/JainMradul/NLP-datasets/blob/main/social_media_sentiment_dataset.csv'
# response = requests.get(url).text  # Fetch data from the URL as text
# data = pd.read_csv(StringIO(response))  # Convert the text data into a DataFrame using StringIO
data.head()



Unnamed: 0,Text,Tweet_About,Sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive


In [157]:
# Step 2: Check categories in sentiment variable

data['Sentiment'].value_counts()

Sentiment
Neutral     5544
Positive    2978
Negative     570
Name: count, dtype: int64

In [158]:
# Lets filter the dataset to classify Positive or Negative
# Remove Neutral tweets

data = data[data['Sentiment'].isin(['Positive','Negative'])]
data = data.reset_index(drop=True)
print(data.shape,data['Sentiment'].value_counts())

(3548, 3) Sentiment
Positive    2978
Negative     570
Name: count, dtype: int64


In [159]:

# Step 3: Data Preprocessing - Handle text columns 

# Define a function to preprocess text using spaCy (stopwords removal, lemmatization)
def preprocess_text_spacy(text):
    """
    Preprocess the input text by:
    - Normalizing and removing special characters
    - Removing stopwords
    - Applying lemmatization
    
    Parameters:
        text (str): The input text to be cleaned and lemmatized using spaCy
    
    Returns:
        str: Cleaned and processed text
    """
    # Normalize the text to remove special characters (e.g., û, ï, etc.)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    
    # Remove any remaining non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text).lower()
    
    # Process the text using spaCy
    doc = nlp(text)
    
    # Lemmatize and remove stopwords
    processed_text = ' '.join([token.lemma_ for token in doc if not token.is_stop])
    
    return processed_text

In [160]:
# The regular expression pattern r'[^\w\s]' will match and remove any character that is not a word character or a whitespace character. Let's break it down:

# \w: Matches any word character, which includes:

# Any letter (uppercase or lowercase), such as a-z, A-Z
# Digits (0-9)
# The underscore (_)
# \s: Matches any whitespace character, which includes:

# Spaces, tabs, newlines, etc.
# ^: The caret inside the square brackets negates the set, meaning the expression will match anything that is not a word character (\w) or a whitespace character (\s).

In [161]:
# Apply the text preprocessing function to the 'Text' and 'tweet about' columns
data['clean_text'] = data['Text'].astype('str').apply(preprocess_text_spacy)  # Preprocess the 'Text' column
data['clean_tweet_about'] = data['Tweet_About'].astype('str').apply(preprocess_text_spacy)  # Preprocess the 'tweet about' column
data[["Text","clean_text","clean_tweet_about","Tweet_About"]].head()

Unnamed: 0,Text,clean_text,clean_tweet_about,Tweet_About
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,wesley83 3 g iphone 3 hrs tweet rise_austin de...,iphone,iPhone
1,@jessedee Know about @fludapp ? Awesome iPad/i...,jessedee know fludapp awesome ipadiphone app...,ipad iphone app,iPad or iPhone App
2,@swonderlin Can not wait for #iPad 2 also. The...,swonderlin wait ipad 2 sale sxsw,ipad,iPad
3,@sxsw I hope this year's festival isn't as cra...,sxsw hope year festival not crashy year iphone...,ipad iphone app,iPad or iPhone App
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,sxtxstate great stuff fri sxsw marissa mayer g...,google,Google


In [162]:
# Concatenate two columns and remove duplicate words
data['text_processed'] = data[['clean_text', 'clean_tweet_about']].apply(lambda row: ' '.join(dict.fromkeys((row['clean_text'] + ' ' + row['clean_tweet_about']).split())), axis=1)
data.head()

Unnamed: 0,Text,Tweet_About,Sentiment,clean_text,clean_tweet_about,text_processed
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative,wesley83 3 g iphone 3 hrs tweet rise_austin de...,iphone,wesley83 3 g iphone hrs tweet rise_austin dead...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive,jessedee know fludapp awesome ipadiphone app...,ipad iphone app,jessedee know fludapp awesome ipadiphone app l...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive,swonderlin wait ipad 2 sale sxsw,ipad,swonderlin wait ipad 2 sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative,sxsw hope year festival not crashy year iphone...,ipad iphone app,sxsw hope year festival not crashy iphone app ...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive,sxtxstate great stuff fri sxsw marissa mayer g...,google,sxtxstate great stuff fri sxsw marissa mayer g...


In [163]:
#Lets remove the columns not required for modelling
data.drop(['Text','Tweet_About','clean_text','clean_tweet_about'],axis=1,inplace=True)
data.head()

Unnamed: 0,Sentiment,text_processed
0,Negative,wesley83 3 g iphone hrs tweet rise_austin dead...
1,Positive,jessedee know fludapp awesome ipadiphone app l...
2,Positive,swonderlin wait ipad 2 sale sxsw
3,Negative,sxsw hope year festival not crashy iphone app ...
4,Positive,sxtxstate great stuff fri sxsw marissa mayer g...


In [164]:
# Step 4: Encode the target variable 'Sentiment'
# Initialize the LabelEncoder to convert categorical labels to numerical format
label_encoder = LabelEncoder()

# Apply label encoding to the 'Sentiment' column and store the result in a new column 'sentiment_encoded'
data['sentiment_encoded'] = label_encoder.fit_transform(data['Sentiment'])


In [165]:
data.head()

Unnamed: 0,Sentiment,text_processed,sentiment_encoded
0,Negative,wesley83 3 g iphone hrs tweet rise_austin dead...,0
1,Positive,jessedee know fludapp awesome ipadiphone app l...,1
2,Positive,swonderlin wait ipad 2 sale sxsw,1
3,Negative,sxsw hope year festival not crashy iphone app ...,0
4,Positive,sxtxstate great stuff fri sxsw marissa mayer g...,1


In [166]:
#Lets remove Sentiment column
data.drop(['Sentiment'],axis=1,inplace=True)
data.head()

Unnamed: 0,text_processed,sentiment_encoded
0,wesley83 3 g iphone hrs tweet rise_austin dead...,0
1,jessedee know fludapp awesome ipadiphone app l...,1
2,swonderlin wait ipad 2 sale sxsw,1
3,sxsw hope year festival not crashy iphone app ...,0
4,sxtxstate great stuff fri sxsw marissa mayer g...,1


In [167]:
data['sentiment_encoded'].unique()

array([0, 1])

In [168]:
# Step 5: Vectorization of text data (Bag of Words or TF-IDF)
# Define a function to convert text data to numerical vectors using either CountVectorizer (Bag of Words) or TfidfVectorizer (TF-IDF)
def text_vectorization(method,df,text_col):
    """
    Convert text data into numerical vectors using either Bag of Words or TF-IDF,
    and return DataFrames with feature names and their corresponding word representations.
    
    Parameters:
        method (str): 'bow' for Bag of Words or 'tfidf' for TF-IDF (default is 'tfidf')
    
    Returns:
        tuple: DataFrames of feature names and word representations for text and hashtag features
    """
    if method == 'bow':
        vectorizer = CountVectorizer()  # Initialize CountVectorizer for Bag of Words
    else:
        vectorizer = TfidfVectorizer()  # Initialize TfidfVectorizer for TF-IDF
    
    # Apply the vectorizer to the text  columns
    text_features = vectorizer.fit_transform(df[text_col])  
    
    # Retrieve feature names
    text_feature_names = vectorizer.get_feature_names_out()
    
    # Create DataFrames with feature names and their corresponding word representations
    text_df = pd.DataFrame(text_features.toarray(), columns=text_feature_names)
    
    return text_df



In [169]:
# Choose 'bow' for Bag of Words or 'tfidf' for TF-IDF
df_vectorized = text_vectorization(method='bow2',df=data,text_col='text_processed')  # Convert text to numerical features using TF-IDF


In [170]:
data.shape,df_vectorized.shape,df_vectorized.head()

((3548, 2),
 (3548, 5449),
     02   03  0310  0310apple   10  100  1000  10000  100s  100tc  ...  zero  \
 0  0.0  0.0   0.0        0.0  0.0  0.0   0.0    0.0   0.0    0.0  ...   0.0   
 1  0.0  0.0   0.0        0.0  0.0  0.0   0.0    0.0   0.0    0.0  ...   0.0   
 2  0.0  0.0   0.0        0.0  0.0  0.0   0.0    0.0   0.0    0.0  ...   0.0   
 3  0.0  0.0   0.0        0.0  0.0  0.0   0.0    0.0   0.0    0.0  ...   0.0   
 4  0.0  0.0   0.0        0.0  0.0  0.0   0.0    0.0   0.0    0.0  ...   0.0   
 
    zimride  zip  zite  zms  zombie  zomg  zone  zoom  zzzs  
 0      0.0  0.0   0.0  0.0     0.0   0.0   0.0   0.0   0.0  
 1      0.0  0.0   0.0  0.0     0.0   0.0   0.0   0.0   0.0  
 2      0.0  0.0   0.0  0.0     0.0   0.0   0.0   0.0   0.0  
 3      0.0  0.0   0.0  0.0     0.0   0.0   0.0   0.0   0.0  
 4      0.0  0.0   0.0  0.0     0.0   0.0   0.0   0.0   0.0  
 
 [5 rows x 5449 columns])

In [171]:
df_processed = pd.concat([df_vectorized, data],axis=1)
df_processed.shape

(3548, 5451)

In [172]:
df_processed.drop('text_processed',axis=1,inplace=True)

In [173]:
df_processed['sentiment_encoded'].unique()

array([0, 1])

In [174]:
# Step 9: Split the dataset into train and test sets
# Use train_test_split to split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    df_processed, 
    df_processed['sentiment_encoded'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df_processed['sentiment_encoded']
)

In [175]:
X_train.drop('sentiment_encoded',axis=1,inplace=True)
X_test.drop('sentiment_encoded',axis=1,inplace=True)

In [176]:
# Step 10: Train model using XGBoost with basic hyperparameter tuning
# Initialize the XGBClassifier with basic hyperparameters
xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=6, random_state=42)

# Fit the model to the training data
xgb_model.fit(X_train, y_train)



In [177]:
# Step 11: Test the model on the test dataset and evaluate metrics
# Predict the labels for the test set using the trained model
y_pred = xgb_model.predict(X_test)

# Step 12: Evaluate the model performance
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Generate a classification report (precision, recall, F1-score, etc.)
classification_metrics = classification_report(y_test, y_pred, target_names=label_encoder.classes_)


In [178]:
# Output the accuracy and classification report
print(f'Accuracy: {accuracy}')  # Print the accuracy score
print('Classification Report:')
print(classification_metrics)  # Print the classification report


Accuracy: 0.8577464788732394
Classification Report:
              precision    recall  f1-score   support

    Negative       0.68      0.22      0.33       114
    Positive       0.87      0.98      0.92       596

    accuracy                           0.86       710
   macro avg       0.77      0.60      0.63       710
weighted avg       0.84      0.86      0.83       710



In [180]:
#bow

Accuracy: 0.8661971830985915
Classification Report:
              precision    recall  f1-score   support

    Negative       0.74      0.25      0.38       114
    Positive       0.87      0.98      0.93       596

    accuracy                           0.87       710
   macro avg       0.81      0.62      0.65       710
weighted avg       0.85      0.87      0.84       710

#tfidf 

Accuracy: 0.8577464788732394
Classification Report:
              precision    recall  f1-score   support

    Negative       0.68      0.22      0.33       114
    Positive       0.87      0.98      0.92       596

    accuracy                           0.86       710
   macro avg       0.77      0.60      0.63       710
weighted avg       0.84      0.86      0.83       710


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 7)