In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

def parse_line(line):
    parts = line.strip().split('\t')
    if len(parts) == 3:
        id_, sentiment, text = parts
        return {'id': id_, 'sentiment': sentiment, 'text': text.strip('"')}
    return None

def load_data(file_path):
    data = []
    dropped_lines = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file, 1):
            parsed = parse_line(line)
            if parsed:
                data.append(parsed)
            else:
                dropped_lines += 1
                if dropped_lines <= 5:  # Print only the first 5 dropped lines
                    print(f"Line {i} didn't match the expected format: {line[:50]}...")

    print(f"Total dropped lines: {dropped_lines}")
    return pd.DataFrame(data)

# Load the data
file_path = '/content/SADATA.txt'
data = load_data(file_path)

# Data exploration
print(data.head())
print(f"\nLoaded {len(data)} rows")
print(data['sentiment'].value_counts())

# Visualize sentiment distribution
import matplotlib.pyplot as plt

data['sentiment'].value_counts().plot(kind='bar')
plt.title('Sentiment Distribution')
plt.savefig('sentiment_distribution.png')
plt.close()

Line 2719 didn't match the expected format: 626437371999977472	neutral	...Yakub may well deser...
Line 4143 didn't match the expected format: 629712146016935936	neutral	Testing Motorola's Moto...
Line 8479 didn't match the expected format: 636047266248265728	negative	"Scott Walker, who thi...
Line 8516 didn't match the expected format: 636112894086782976	neutral	@lindaikeji: Super Eagl...
Line 10201 didn't match the expected format: 637879757124669440	neutral	@Roman_Empire_76:   BRE...
Total dropped lines: 11
                   id sentiment  \
0  619950566786113536   neutral   
1  619969366986235905   neutral   
2  619971047195045888  negative   
3  619974445185302528   neutral   
4  619987808317407232  positive   

                                                text  
0  Picturehouse's, Pink Floyd's, 'Roger Waters: T...  
1  Order Go Set a Watchman in store or through ou...  
2  If these runway renovations at the airport pre...  
3  If you could ask an onstage interview question...  

In [3]:
# Load the data
file_path = '/content/SADATA.txt'
data = load_data(file_path)

print(data.head())
print(f"\nLoaded {len(data)} rows")
print(data['sentiment'].value_counts())

Line 2719 didn't match the expected format: 626437371999977472	neutral	...Yakub may well deser...
Line 4143 didn't match the expected format: 629712146016935936	neutral	Testing Motorola's Moto...
Line 8479 didn't match the expected format: 636047266248265728	negative	"Scott Walker, who thi...
Line 8516 didn't match the expected format: 636112894086782976	neutral	@lindaikeji: Super Eagl...
Line 10201 didn't match the expected format: 637879757124669440	neutral	@Roman_Empire_76:   BRE...
Total dropped lines: 11
                   id sentiment  \
0  619950566786113536   neutral   
1  619969366986235905   neutral   
2  619971047195045888  negative   
3  619974445185302528   neutral   
4  619987808317407232  positive   

                                                text  
0  Picturehouse's, Pink Floyd's, 'Roger Waters: T...  
1  Order Go Set a Watchman in store or through ou...  
2  If these runway renovations at the airport pre...  
3  If you could ask an onstage interview question...  

In [6]:
# Text preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [7]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [5]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
data['processed_text'] = data['text'].apply(preprocess_text)

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(data['processed_text'], data['sentiment'], test_size=0.2, random_state=42)

In [10]:
# Define models
models = {
    'SVM': LinearSVC(),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier()
}

In [11]:
# Train and evaluate models
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))

    results[name] = pipeline



SVM Classification Report:
              precision    recall  f1-score   support

    negative       0.56      0.40      0.47       619
     neutral       0.66      0.74      0.70      2090
    positive       0.66      0.62      0.64      1416

    accuracy                           0.65      4125
   macro avg       0.63      0.59      0.60      4125
weighted avg       0.65      0.65      0.65      4125


Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.16      0.26       619
     neutral       0.62      0.83      0.71      2090
    positive       0.67      0.55      0.60      1416

    accuracy                           0.63      4125
   macro avg       0.64      0.51      0.52      4125
weighted avg       0.64      0.63      0.60      4125


Random Forest Classification Report:
              precision    recall  f1-score   support

    negative       0.66      0.25      0.36       619
     neutral       0.63    

In [12]:
# Visualize results
model_names = list(results.keys())
accuracies = [classification_report(y_test, model.predict(X_test), output_dict=True)['accuracy'] for model in results.values()]

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies)
plt.title('Model Accuracies')
plt.ylabel('Accuracy')
plt.savefig('model_accuracies.png')
plt.close()

In [13]:
# Select the best model (highest accuracy)
best_model_name = model_names[np.argmax(accuracies)]
best_model = results[best_model_name]

print(f"\nBest Model: {best_model_name}")



Best Model: SVM


In [14]:
# Confusion Matrix for the best model
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.close()

In [15]:
 # Feature importance (if applicable)
if best_model_name in ['SVM', 'Random Forest']:
    tfidf_vectorizer = best_model.named_steps['tfidf']
    classifier = best_model.named_steps['classifier']

    if best_model_name == 'SVM':
        feature_importance = abs(classifier.coef_[0])
    else:  # Random Forest
        feature_importance = classifier.feature_importances_

    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Get top 20 features
    top_features = sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:20]

    plt.figure(figsize=(12, 8))
    sns.barplot(x=[f[1] for f in top_features], y=[f[0] for f in top_features])
    plt.title(f'Top 20 Important Features - {best_model_name}')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

In [16]:
# Save the best model
import joblib
joblib.dump(best_model, 'best_sentiment_model.joblib')

print("\nBest model saved as 'best_sentiment_model.joblib'")


Best model saved as 'best_sentiment_model.joblib'


In [17]:
# Sample prediction
sample_text = "This movie was amazing! I loved every minute of it."
processed_sample = preprocess_text(sample_text)
prediction = best_model.predict([processed_sample])
print(f"\nSample text: {sample_text}")
print(f"Predicted sentiment: {prediction[0]}")


Sample text: This movie was amazing! I loved every minute of it.
Predicted sentiment: positive


In [19]:
# Sample prediction
sample_text = " I loved the Movie "
processed_sample = preprocess_text(sample_text)
prediction = best_model.predict([processed_sample])
print(f"\nSample text: {sample_text}")
print(f"Predicted sentiment: {prediction[0]}")


Sample text:  I loved the Movie 
Predicted sentiment: positive
