# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Fetch the NLTK stop-word corpus; clean_text() below reads it.
nltk.download('stopwords')

# Read the gzip-compressed TMDB movies CSV into a DataFrame.
DATASET_PATH = '/content/drive/MyDrive/MSc. DSA/Module V/DSA 8501 Text and Unstructured Data Analytics/tmdb_5000_movies.csv.gz'
df = pd.read_csv(DATASET_PATH)

# Preview the first rows. `display` is not imported in this file --
# presumably the IPython/notebook builtin; confirm before running as a script.
display(df.head())

# Report the number of missing values per column.
missing_counts = df.isna().sum()
print("Missing values:")
print(missing_counts)

# List the column names.
print(df.columns)

# Replace this with your actual sentiment analysis logic.
# NOTE(review): the labels below are assigned at random, independent of the
# text, so any classifier trained on them can only be expected to score at
# chance level.
# A seeded generator keeps the placeholder labels -- and every number derived
# from them -- identical between runs, matching random_state=42 used for the
# train/test split.
rng = np.random.default_rng(42)
df['sentiment'] = rng.choice(['positive', 'negative'], size=len(df))

# Plot sentiment distribution with labels
plt.figure(figsize=(6, 4))
ax = sns.countplot(x='sentiment', data=df, palette='coolwarm')
plt.title("Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Count")

# Add text labels: one count centred just above each bar.
for p in ax.patches:
    bar_height = p.get_height()
    ax.annotate(
        # Bar heights may come back as floats; ".0f" prints "2401", not "2401.0".
        f'{bar_height:.0f}',
        (p.get_x() + p.get_width() / 2., bar_height),
        ha='center', va='baseline', fontsize=12, color='black',
        xytext=(0, 5), textcoords='offset points',
    )

plt.show()

# Preprocess text
_ENGLISH_STOPWORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise one piece of text for word counting and vectorisation.

    Strings are lower-cased, every character other than ASCII letters and
    digits is replaced by a space, and English stop words are dropped; the
    remaining words are joined with single spaces.

    Args:
        text: The value to clean. Usually a string; missing values
            (``None`` or a float NaN) and other non-string values are
            accepted too.

    Returns:
        The cleaned string. Missing values yield ``''`` so that they do not
        inject the literal tokens "nan"/"None" into downstream word counts;
        any other non-string value is returned as ``str(text)`` unchanged.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        return str(text)

    tokens = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()).split()
    if not tokens:
        return ''

    # Set membership is O(1); the original rebuilt the stop-word list for
    # every single word.
    stop_words = _english_stopwords()
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every overview and store the result in its own column.
df['cleaned_overview'] = df['overview'].map(clean_text)

# All cleaned overviews as one space-separated string; used for both the
# word cloud and the frequency count.
corpus_text = ' '.join(df['cleaned_overview'])

# Generate WordCloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud of Movie Reviews")
plt.show()

# Analyze Word Frequency: count each word and keep the 20 most frequent.
all_words = corpus_text.split()
word_freq = Counter(all_words)
most_common_words = word_freq.most_common(20)

# Tabulate the (word, count) pairs for plotting.
word_freq_df = pd.DataFrame(most_common_words, columns=['Word', 'Frequency'])

# Horizontal bar chart of the most common words.
plt.figure(figsize=(10, 5))
sns.barplot(data=word_freq_df, x='Frequency', y='Word', palette='viridis')
plt.title("Top 20 Most Common Words in Reviews")
plt.xlabel("Frequency")
plt.ylabel("Words")
plt.show()

# Encode the sentiment labels as integers (0 = negative, 1 = positive).
# NOTE: this overwrites the column in place, so running it a second time
# maps the already-encoded 0/1 values to NaN.
label_codes = {'negative': 0, 'positive': 1}
df['sentiment'] = df['sentiment'].map(label_codes)

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
features = df['cleaned_overview']
labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# TF-IDF features: the vocabulary is fitted on the training text only and
# then reused to transform the test text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Naïve Bayes: fit on the training features, predict the test set.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_predictions = nb_model.predict(X_test_tfidf)

# Logistic Regression: same procedure, with a raised iteration cap.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
lr_predictions = lr_model.predict(X_test_tfidf)

# Evaluate models
def evaluate_model(model_name, y_true, y_pred):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        model_name: Label printed in the header line.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model, aligned with ``y_true``.

    Returns:
        None. All results are written to standard output.
    """
    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    # zero_division=0 reports an undefined score (e.g. a class that is never
    # predicted) as 0 -- the value scikit-learn already falls back to --
    # without emitting an UndefinedMetricWarning for each metric.
    scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for metric_name, scorer in scorers:
        print(f"{metric_name}: {scorer(y_true, y_pred, zero_division=0):.4f}")

    print("\nClassification Report:\n", classification_report(y_true, y_pred, zero_division=0))
    print("-"*50)

# Print the evaluation results of each model against the held-out labels.
for model_label, predictions in (
    ("Naïve Bayes", nb_predictions),
    ("Logistic Regression", lr_predictions),
):
    evaluate_model(model_label, y_test, predictions)

# Visualize the performance of both models
plt.figure(figsize=(10, 5))