In [1]:
# One-time environment setup -- uncomment and run on a fresh kernel:
# %pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
# Import required libraries
import pandas as pd  # For handling and manipulating data in DataFrame
import numpy as np  # For numerical operations
import re  # Regular expression operations for text cleaning
import gensim  # For Word2Vec model (CBOW/Skip-gram)
from sklearn.model_selection import train_test_split  # To split dataset into train and test sets
from sklearn.preprocessing import LabelEncoder  # Normalize numerical data and encode categorical variables
from xgboost import XGBClassifier  # XGBoost classifier for training the model
from sklearn.metrics import classification_report, accuracy_score  # To evaluate model performance
import unicodedata
import spacy

In [3]:
# Load spaCy's small English pipeline. In this notebook it is used only for
# stop-word detection and lemmatization (see preprocess_text_spacy); the
# embeddings fed to the classifier come from the Word2Vec model trained later.
nlp = spacy.load('en_core_web_sm')

# Single source of truth for the dataset location, so the fallback read below
# cannot drift from the primary one.
DATA_PATH = '/kaggle/input/tweet-sentiments-dataset/tweet_sentiments_dataset.csv'

# Load the dataset. Try UTF-8 first and fall back to ISO-8859-1 when the file
# contains bytes that are not valid UTF-8.
try:
    data = pd.read_csv(DATA_PATH, encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')



In [4]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,Text,Tweet_About,Sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive


In [5]:
# Binary task: keep only the rows labelled Positive or Negative, then
# renumber the index from 0 so later positional operations line up.
data = data.loc[data['Sentiment'].isin(['Positive', 'Negative'])].reset_index(drop=True)

# Report the remaining size and the class balance.
print(data.shape, data['Sentiment'].value_counts())



(3548, 3) Sentiment
Positive    2978
Negative     570
Name: count, dtype: int64


In [6]:
# Step 3: Data Preprocessing - Handle text columns 
def preprocess_text_spacy(text):
    """
    Clean a piece of text before it is turned into embeddings.

    Steps:
    - Decompose Unicode (NFKD) and drop every character that is not ASCII
    - Remove everything that is neither a word character nor whitespace,
      then lowercase
    - Tokenize with the module-level spaCy pipeline ``nlp``
    - Drop stop words and whitespace-only tokens; keep each token's lemma

    Parameters:
        text (str): Raw input text.

    Returns:
        str: The remaining lemmas joined by single spaces.
    """
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    text = re.sub(r'[^\w\s]', '', text).lower()
    doc = nlp(text)
    # Stripping punctuation can leave runs of spaces, which spaCy emits as
    # separate whitespace tokens. They are not stop words, so they must be
    # skipped explicitly or they end up as stray blanks in the result.
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_space)
    ]
    return ' '.join(lemmas)

# Apply the preprocessing function to text columns.
# fillna('') comes first: astype(str) on its own turns a missing value into
# the literal string 'nan', which would then be embedded like a real word.
data['clean_text'] = data['Text'].fillna('').astype(str).apply(preprocess_text_spacy)
data['clean_tweet_about'] = data['Tweet_About'].fillna('').astype(str).apply(preprocess_text_spacy)

# Merge the tweet tokens with the topic tokens. dict.fromkeys keeps only the
# first occurrence of each token while preserving the original order.
data['text_processed'] = [
    ' '.join(dict.fromkeys(f'{tweet} {topic}'.split()))
    for tweet, topic in zip(data['clean_text'], data['clean_tweet_about'])
]

# The raw and intermediate text columns are no longer needed.
data = data.drop(columns=['Text', 'Tweet_About', 'clean_text', 'clean_tweet_about'])



In [7]:
# Step 4: Train Word2Vec model using Gensim
def train_word2vec_model(text_data, sg=0, vector_size=100, window=5,
                         min_count=1, workers=4, seed=1):
    """
    Train a Word2Vec model using Gensim.

    Parameters:
        text_data (iterable of str): Preprocessed documents (a list or a
            pandas Series both work); each is split on whitespace into tokens.
        sg (int): Training algorithm; 1 for Skip-gram, 0 for CBOW (default).
        vector_size (int): Dimensionality of the word vectors.
        window (int): Context window size.
        min_count (int): Minimum word frequency for a word to be kept.
        workers (int): Number of worker threads used for training.
        seed (int): Seed for the random number generator. Per the Gensim
            documentation, a fully reproducible run additionally needs
            ``workers=1`` (and a fixed ``PYTHONHASHSEED``).

    Returns:
        model: Trained Word2Vec model.
    """
    # Tokenize each document into a list of words
    tokenized_text = [sentence.split() for sentence in text_data]

    # Train Word2Vec model
    model = gensim.models.Word2Vec(
        sentences=tokenized_text,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        workers=workers,
        seed=seed,
    )
    return model


In [8]:
# Select the Word2Vec training algorithm: 1 = Skip-gram, 0 = CBOW.
sg_param = 1

# NOTE(review): the embeddings are fit on every row of `data`, i.e. before the
# train/test split done later, so test tweets influence the features -- confirm
# this is acceptable for the evaluation.
word2vec_model = train_word2vec_model(text_data=data['text_processed'], sg=sg_param)


In [9]:
# Step 5: Get Word2Vec Embeddings using the trained model
def get_word2vec_embedding(text, model):
    """
    Convert the input text into a single document embedding.

    The text is split on whitespace; tokens missing from the model's
    vocabulary are ignored and the vectors of the remaining tokens are averaged.

    Parameters:
        text (str): The input text to convert to word embeddings.
        model: Trained Word2Vec model (must expose ``wv`` and ``vector_size``).

    Returns:
        np.array: Mean of the word vectors for the input text, or a zero
        vector of length ``model.vector_size`` if no token is in the vocabulary.
    """
    known_vectors = [model.wv[token] for token in text.split() if token in model.wv]

    if not known_vectors:
        # Gensim stores word vectors as float32. Use the same dtype for the
        # fallback so one empty document does not upcast the whole feature
        # matrix to float64 once the vectors are stacked.
        return np.zeros(model.vector_size, dtype=np.float32)

    # Average over tokens to get one fixed-size vector per document
    return np.mean(known_vectors, axis=0)

# Embed every processed text with the trained model; each cell of the new
# column holds one document vector.
data['word2vec_embeddings'] = data['text_processed'].map(
    lambda tweet: get_word2vec_embedding(tweet, word2vec_model)
)


In [10]:
# Guard against missing embeddings. get_word2vec_embedding returns a zero
# vector (not NaN) for texts without known tokens, so this is not expected to
# remove any rows.
data = data.dropna(subset=['word2vec_embeddings'])

# Expand the per-row vectors into one numeric column per embedding dimension.
# Reuse data's index: pd.concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index would pair features with the wrong labels if rows were ever
# dropped above.
embeddings_df = pd.DataFrame(data['word2vec_embeddings'].to_list(), index=data.index)

# Label encoding. LabelEncoder sorts the classes, so Negative -> 0 and
# Positive -> 1.
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['Sentiment'])


In [11]:
# Attach the encoded label to the embedding columns (rows are matched on index).
df_processed = pd.concat([embeddings_df, data[['sentiment_encoded']]], axis=1)

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_processed.drop(columns='sentiment_encoded'),
    df_processed['sentiment_encoded'],
    test_size=0.2,
    stratify=df_processed['sentiment_encoded'],
    random_state=42,
)



In [12]:
# Gradient-boosted trees on the document embeddings.
# NOTE(review): no class weighting is applied although the labels are
# imbalanced -- consider whether that is intended.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
)

# Train on the training split
xgb_model.fit(X_train, y_train)

In [13]:
# Predict labels for the held-out split
y_pred = xgb_model.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.8408450704225352
              precision    recall  f1-score   support

           0       0.52      0.15      0.23       114
           1       0.86      0.97      0.91       596

    accuracy                           0.84       710
   macro avg       0.69      0.56      0.57       710
weighted avg       0.80      0.84      0.80       710

