In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
#import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
#from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [5]:
!pip install sumy





In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import re
import string
import scipy.sparse as sp

# Define preprocessing function
def preprocess_and_lemmatize(text, custom_stop_words=None):
    if custom_stop_words is None:
        custom_stop_words = []
    stop_words = set(stopwords.words('english')).union(set(custom_stop_words), ENGLISH_STOP_WORDS)
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalpha() and word not in stop_words]
    pos_tags = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return ' '.join(lemmatized_words)

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Define custom stop words specific to media reviews
custom_stop_words = [
    'episode', 'season', 'series', 'show', 'watch', 'view', 'character', 'plot', 
    'scene', 'actor', 'actress', 'director', 'cinema', 'movie', 'film', 
    'drama', 'comedy', 'thriller', 'action', 'perform', 'role', 'cast', 
    'seasons', 'episodes', 'viewers', 'watching', 'stream', 'streaming',
    'television', 'tv', 'netflix', 'hbo', 'amazon', 'prime', 'disney', 
    'spoiler', 'spoilers'
]

# Load the dataset
file_path = r"C:\Users\Aasna\Downloads\All (2).csv" 
df = pd.read_csv(file_path)

# Apply preprocessing and lemmatization to the review column
df['Lemmatized_Review'] = df['Review'].apply(lambda x: preprocess_and_lemmatize(x, custom_stop_words))

# Categorize 'Rating' into bins
bins = [0, 4, 7, 10]
labels = ['Low', 'Medium', 'High']
df['Rating_Category'] = pd.cut(df['Rating'], bins=bins, labels=labels, include_lowest=True)

# One-hot encode the "Region" column
one_hot_encoder = OneHotEncoder(sparse=True)
#encoded_regions = one_hot_encoder.fit_transform(df[['Region']])

# Encode the 'Rating_Category' column as the target
encoder = LabelEncoder()
y = encoder.fit_transform(df['Rating_Category'])

# Best n-gram range
n_gram_range = (1, 1)

# Vectorization with the best n-gram range
tfidf_vectorizer_reviews = TfidfVectorizer(ngram_range=n_gram_range)
reviews_vectorized = tfidf_vectorizer_reviews.fit_transform(df['Lemmatized_Review'])

tfidf_vectorizer_shows = TfidfVectorizer(ngram_range=n_gram_range)
shows_vectorized = tfidf_vectorizer_shows.fit_transform(df['Show'])

# Combine vectorized features and one-hot encoded 'Region'
X = sp.hstack([reviews_vectorized, shows_vectorized])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE only on the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# You can check the new distribution of the classes
print("New distribution of classes:")
print(pd.Series(y_train_smote).value_counts())

New distribution of classes:
0    2801
2    2801
1    2801
dtype: int64


In [8]:
# Assuming 'encoder' is your LabelEncoder instance used to encode 'Rating_Category'
categories_to_classes = {category: i for i, category in enumerate(encoder.classes_)}
print("Category to Class mapping:", categories_to_classes)

Category to Class mapping: {'High': 0, 'Low': 1, 'Medium': 2}


In [9]:
mlr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
mlr.fit(X_train_smote, y_train_smote)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [10]:
# Predict with Logistic Regression
logreg_predictions = mlr.predict(X_test)
# Predicting classes directly
predicted_classes = mlr.predict(X_test)
print("Predicted classes:")
print(predicted_classes)

# Predicting probabilities
predicted_probabilities = mlr.predict_proba(X_test)
print("\nPredicted probabilities:")
print(predicted_probabilities)

# If you're interested in the probabilities of the positive class (e.g., success = 1)
print("\nProbabilities of the positive class:")
print(predicted_probabilities[:, 1])

Predicted classes:
[2 2 2 0 0 2 0 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 2 0
 2 1 2 0 0 0 0 0 0 1 0 1 2 0 1 0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 2 0 0 2 0 0 0 0 0 2 0 1 0 0 0 0 1 1 0 0 2 0 0 0 2 2 0 0 0 0 1 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 2 0 2 1 1 0 1 0 1 2 0 0 0 2 0 1 0 2 0 0
 0 1 0 0 0 0 0 0 1 0 0 2 1 0 0 0 0 2 0 0 1 0 1 1 0 0 0 2 2 0 0 1 0 0 0 1 0
 0 0 0 0 0 0 2 1 2 0 0 2 0 0 0 0 1 2 1 0 0 0 0 0 2 0 1 0 2 0 0 0 0 2 0 1 2
 0 1 0 2 1 0 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 1 1 1 0 2
 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 2 1 0 0 1 2 0 0 2 2 1 1 0 2 1 2 0 0 0 0
 0 0 0 0 1 2 1 0 0 1 1 0 1 0 2 2 0 0 1 0 0 0 0 0 2 0 0 0 0 1 0 1 2 2 0 0 2
 1 0 1 0 0 0 0 0 0 2 0 0 0 0 2 0 2 0 0 0 0 0 0 2 2 0 0 0 0 1 2 0 0 0 0 0 0
 1 1 2 2 1 2 0 0 0 0 0 0 2 0 0 1 0 0 0 0 2 2 0 0 2 0 0 0 1 1 0 0 0 2 0 2 1
 0 2 1 0 0 0 1 0 0 0 2 2 2 0 0 2 0 1 2 0 2 0 0 0 0 0 1 0 2 0 0 0 0 2 0 0 0
 0 0 0 0 0 0 2 2 1 0 0 0 2 2 2 2 0 2 0 2 1 2 0 1 0 0 0 0 0 0 2 0 2 0 0 0 0
 1 0 2

In [11]:
from sklearn.metrics import classification_report, accuracy_score

# Decoding the predictions back to the original labels
predicted_categories = encoder.inverse_transform(y_pred)

# Evaluating the model
print("Accuracy on Test Set: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=encoder.classes_))

# Decoding the actual test set categories
actual_categories = encoder.inverse_transform(y_test)

# Printing a comparison
for actual, predicted in zip(actual_categories, predicted_categories):  # Just the first 10 for brevity
    print(f"Actual: {actual}, Predicted: {predicted}")

NameError: name 'y_pred' is not defined

## Pipeline

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Custom text preprocessing transformer
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words=None):
        self.stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS, stop_words if stop_words else [])
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        processed_texts = []
        for text in X:
            words = word_tokenize(text.lower())
            words = [word for word in words if word.isalpha() and word not in self.stop_words]
            pos_tags = pos_tag(words)
            lemmatized_words = [self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag)) for word, tag in pos_tags]
            processed_texts.append(' '.join(lemmatized_words))
        return np.array(processed_texts)
    
    def get_wordnet_pos(self, tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

# Define custom stop words specific to media reviews
custom_stop_words = [
    'episode', 'season', 'series', 'show', 'watch', 'view', 'character', 'plot', 
    'scene', 'actor', 'actress', 'director', 'cinema', 'movie', 'film', 
    'drama', 'comedy', 'thriller', 'action', 'perform', 'role', 'cast', 
    'seasons', 'episodes', 'viewers', 'watching', 'stream', 'streaming',
    'television', 'tv', 'netflix', 'hbo', 'amazon', 'prime', 'disney', 
    'spoiler', 'spoilers'
]

# Load the dataset (update the file path as needed)
df = pd.read_csv(r"C:\Users\m_sha\Documents\Text Analytics\All with regions.csv")

# Preprocessing steps for categorizing 'Rating' into bins and encoding 'Region'
df['Rating_Category'] = pd.cut(df['Rating'], bins=[0, 4, 7, 10], labels=['Low', 'Medium', 'High'], include_lowest=True)

# Setup for preprocessing and model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('text_review', TfidfVectorizer(preprocessor=lambda x: x, tokenizer=word_tokenize, stop_words=custom_stop_words, ngram_range=(1, 1)), 'Review'),
        ('region', OneHotEncoder(), ['Region'])
    ],
    remainder='drop'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Encode the target variable
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df['Rating_Category'])

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(df[['Review', 'Region']], y_encoded, test_size=0.2, random_state=42)

# Training the pipeline
pipeline.fit(X_train, y_train)

# Making predictions
y_pred = pipeline.predict(X_test)
predicted_categories = encoder.inverse_transform(y_pred)

# Output predicted categories
print("Predicted Categories:")
for i, pred in enumerate(predicted_categories[:10]):  # Example: Output first 10 predictions
    print(f"Sample {i+1}: Predicted Category - {pred}")

# Evaluate model performance
print("\nAccuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=encoder.classes_))


In [None]:
# Convert predicted_categories back to DataFrame for easy filtering
predicted_df = pd.DataFrame(predicted_categories, columns=['Predicted_Rating'], index=X_test.index)

# Add predicted ratings to the original DataFrame
df_with_predictions = df.join(predicted_df, how='inner')

# Filter reviews based on predicted rating
high_rating_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High']['Review']

# Assuming 'Show' is the column name for show names and it's included in df_with_predictions
# Filter for high-rating reviews and group by 'Show'
grouped_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High'].groupby('Show')['Review'].apply(' '.join).reset_index()

In [None]:
import pandas as pd

def process_predictions_to_reviews(df, predicted_categories, X_test):
    # Convert predicted_categories back to DataFrame for easy filtering
    predicted_df = pd.DataFrame(predicted_categories, columns=['Predicted_Rating'], index=X_test.index)
    
    # Add predicted ratings to the original DataFrame
    df_with_predictions = df.join(predicted_df, how='inner')
    
    # Filter for high-rating reviews and group by 'Show'
    grouped_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High'] \
                                            .groupby('Show')['Review'].apply(' '.join).reset_index()
    
    return grouped_reviews

## Original summarizer

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


def simple_summarize(text, num_sentences=3):
    sentences = [sent for sent in nltk.sent_tokenize(text)]
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(sentences)
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    avg = np.mean(cosine_sim, axis=1)
    top_idx = np.argsort(avg)[-num_sentences:]
    return ' '.join([sentences[idx] for idx in sorted(top_idx)])

# Assuming 'Show' column exists and 'df_with_predictions' includes 'Show', 'Review', 'Predicted_Rating'
# Filter for high-rating reviews and group by 'Show'
grouped_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High'].groupby('Show')['Review'].apply(' '.join).reset_index()

# Summarize reviews for each show
grouped_reviews['Summary'] = grouped_reviews['Review'].apply(lambda text: simple_summarize(text, num_sentences=5))

# Print summaries for each show
for index, row in grouped_reviews.iterrows():
    print(f"Show: {row['Show']}")
    print(f"Summary: {row['Summary']}\n")


## Summaries by Roles

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

#nltk.download('punkt')  # Make sure to download necessary NLTK data

def simple_summarize(text, num_sentences=3):
    sentences = nltk.sent_tokenize(text)
    num_sentences = min(num_sentences, len(sentences))
    if len(sentences) <= num_sentences:
        return ' '.join(sentences)
    
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(sentences)
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    avg = np.mean(cosine_sim, axis=1)
    top_idx = np.argsort(avg)[-num_sentences:]
    
    # Now select the sentences and use sumy for further summarization if needed
    selected_sentences = ' '.join([sentences[idx] for idx in sorted(top_idx)])
    parser = PlaintextParser.from_string(selected_sentences, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentences_count=num_sentences)
    summary_text = ' '.join([str(sentence) for sentence in summary])
    return summary_text

def create_role_based_summary(text, role_focus, num_sentences=3):
    sentences = nltk.sent_tokenize(text)
    keywords = {
        'content': ['story', 'character', 'plot', 'engagement'],
        'marketing': ['dub', 'sub', 'media', 'hype', 'viewer'],
        'studio_production': ['production', 'performance', 'direction', 'budget']
    }[role_focus]
    
    # Filter sentences based on keywords relevant to the role
    filtered_sentences = [sentence for sentence in sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
    
    # If not enough sentences after filtering, return the joined filtered sentences
    if len(filtered_sentences) <= num_sentences:
        return ' '.join(filtered_sentences)
    
    # Apply TF-IDF and cosine similarity on filtered sentences
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(filtered_sentences)
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    avg = np.mean(cosine_sim, axis=1)
    top_idx = np.argsort(avg)[-num_sentences:]
    
    # Select the most significant sentences based on cosine similarity
    selected_sentences = ' '.join([filtered_sentences[idx] for idx in sorted(top_idx)])
    
    # Further summarize using sumy's TextRankSummarizer
    parser = PlaintextParser.from_string(selected_sentences, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentences_count=num_sentences)
    summary_text = ' '.join([str(sentence) for sentence in summary])
    
    return summary_text

# Example usage
grouped_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High'].groupby('Show')['Review'].apply(' '.join).reset_index()
role_focus = 'content'  # Adjust as needed
num_sentences = 2  # Adjust for shorter summaries

grouped_reviews['General_Summary'] = grouped_reviews['Review'].apply(lambda text: simple_summarize(text, num_sentences=num_sentences))
grouped_reviews[f'{role_focus.capitalize()}_Summary'] = grouped_reviews['Review'].apply(lambda text: create_role_based_summary(text, role_focus, num_sentences=num_sentences))

# Print summaries for each show
for index, row in grouped_reviews.iterrows():
    print(f"Show: {row['Show']}")
    print(f"General Summary: {row['General_Summary']}")
    print(f"{role_focus.capitalize()} Summary: {row[f'{role_focus.capitalize()}_Summary']}\n")

# Final Summarizer

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from textblob import TextBlob
import re
from nltk.tokenize import sent_tokenize

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Functions for summarization
def filter_positive_sentences(sentences):
    """Returns only sentences with a positive sentiment."""
    positive_sentences = []
    for sentence in sentences:
        blob = TextBlob(sentence)
        if blob.sentiment.polarity > 0.2:  # Positive sentiment
            positive_sentences.append(sentence)
    return positive_sentences

def summarize_with_lsa(positive_sentences, num_sentences=3):
    """Summarizes the text using LSA on filtered positive sentences."""
    if len(positive_sentences) <= num_sentences:
        return ' '.join(positive_sentences)

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(positive_sentences)
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    avg = np.mean(cosine_sim, axis=1)
    top_idx = np.argsort(avg)[-num_sentences:]
    
    selected_sentences = ' '.join([positive_sentences[idx] for idx in sorted(top_idx)])
    parser = PlaintextParser.from_string(selected_sentences, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count=num_sentences)
    summary_text = ' '.join([str(sentence) for sentence in summary])
    return summary_text

def capitalize_sentences(text):
    sentences = sent_tokenize(text)
    capitalized_sentences = [sentence[0].upper() + sentence[1:] for sentence in sentences]
    return ' '.join(capitalized_sentences)

def clean_and_format_summary(text):
    # Capitalize the first letter of each sentence
    text = capitalize_sentences(text)
    
    # Fix spacing around punctuation
    text = re.sub(r'\s+([.,;?!])', r'\1', text)
    
    # Remove ellipses
    text = text.replace('...', '')
    
    # Remove any redundant whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Ensure proper sentence endings - add a period if missing at the end
    if not text.endswith('.'):
        text += '.'
    
    return text

def simple_summarize(text, num_sentences=3):
    sentences = nltk.sent_tokenize(text)
    positive_sentences = filter_positive_sentences(sentences)
    summary_text = summarize_with_lsa(positive_sentences, num_sentences)
    
    # Clean and format summary for professionalism
    summary_text_formatted = clean_and_format_summary(summary_text)
    
    return summary_text_formatted

def create_role_based_summary(text, role_focus, num_sentences=3):
    sentences = nltk.sent_tokenize(text)
    positive_sentences = filter_positive_sentences(sentences)
    
    keywords = {
        'content': ['story', 'character', 'plot', 'engagement'],
        'marketing': ['dub', 'sub', 'media', 'hype', 'viewer'],
        'studio_production': ['production', 'performance', 'direction', 'budget']
    }[role_focus]
    
    role_filtered_sentences = [sentence for sentence in positive_sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
    summary_text = summarize_with_lsa(role_filtered_sentences, num_sentences)
    
    # Clean and format summary for professionalism
    summary_text_formatted = clean_and_format_summary(summary_text)
    
    return summary_text_formatted

# Assuming df_with_predictions is defined correctly
# Example usage
grouped_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High'].groupby('Show')['Review'].apply(' '.join).reset_index()
role_focus = 'studio_production'  # Adjust as needed
num_sentences = 3  # Adjust for shorter summaries

grouped_reviews['General_Summary'] = grouped_reviews['Review'].apply(lambda text: simple_summarize(text, num_sentences=num_sentences))
grouped_reviews[f'{role_focus.capitalize()}_Summary'] = grouped_reviews['Review'].apply(lambda text: create_role_based_summary(text, role_focus, num_sentences=num_sentences))

# Print summaries for each show
for index, row in grouped_reviews.iterrows():
    print(f"Show: {row['Show']}")
    print(f"General Summary: {row['General_Summary']}")
    print(f"{role_focus.capitalize()} Summary: {row[f'{role_focus.capitalize()}_Summary']}\n")

# Pipeline

In [35]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Custom text preprocessing transformer
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words=None):
        self.stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS, stop_words if stop_words else [])
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        processed_texts = []
        for text in X:
            words = word_tokenize(text.lower())
            words = [word for word in words if word.isalpha() and word not in self.stop_words]
            pos_tags = pos_tag(words)
            lemmatized_words = [self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag)) for word, tag in pos_tags]
            processed_texts.append(' '.join(lemmatized_words))
        return np.array(processed_texts)
    
    def get_wordnet_pos(self, tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

# Define custom stop words specific to media reviews
custom_stop_words = [
    'episode', 'season', 'series', 'show', 'watch', 'view', 'character', 'plot', 
    'scene', 'actor', 'actress', 'director', 'cinema', 'movie', 'film', 
    'drama', 'comedy', 'thriller', 'action', 'perform', 'role', 'cast', 
    'seasons', 'episodes', 'viewers', 'watching', 'stream', 'streaming',
    'television', 'tv', 'netflix', 'hbo', 'amazon', 'prime', 'disney', 
    'spoiler', 'spoilers'
]

# Load the dataset (update the file path as needed)
df = pd.read_csv(r"C:\Users\m_sha\Documents\Text Analytics\All with regions.csv")

# Preprocessing steps for categorizing 'Rating' into bins and encoding 'Region'
df['Rating_Category'] = pd.cut(df['Rating'], bins=[0, 4, 7, 10], labels=['Low', 'Medium', 'High'], include_lowest=True)

# Setup for preprocessing and model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('text_review', TfidfVectorizer(preprocessor=lambda x: x, tokenizer=word_tokenize, stop_words=custom_stop_words, ngram_range=(1, 1)), 'Review'),
        ('region', OneHotEncoder(), ['Region'])
    ],
    remainder='drop'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Encode the target variable
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df['Rating_Category'])

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(df[['Review', 'Region']], y_encoded, test_size=0.2, random_state=42)

# Training the pipeline
pipeline.fit(X_train, y_train)

# Making predictions
y_pred = pipeline.predict(X_test)
predicted_categories = encoder.inverse_transform(y_pred)

# Output predicted categories
print("Predicted Categories:")
for i, pred in enumerate(predicted_categories[:10]):  # Example: Output first 10 predictions
    print(f"Sample {i+1}: Predicted Category - {pred}")

# Evaluate model performance
print("\nAccuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=encoder.classes_))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\m_sha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\m_sha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\m_sha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\m_sha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predicted Categories:
Sample 1: Predicted Category - High
Sample 2: Predicted Category - High
Sample 3: Predicted Category - High
Sample 4: Predicted Category - Medium
Sample 5: Predicted Category - High
Sample 6: Predicted Category - High
Sample 7: Predicted Category - High
Sample 8: Predicted Category - Medium
Sample 9: Predicted Category - Medium
Sample 10: Predicted Category - High

Accuracy on Test Set: 0.766

Classification Report:
               precision    recall  f1-score   support

        High       0.79      0.98      0.88       688
         Low       0.78      0.28      0.41       153
      Medium       0.51      0.31      0.38       159

    accuracy                           0.77      1000
   macro avg       0.69      0.52      0.56      1000
weighted avg       0.75      0.77      0.73      1000



In [80]:
import pandas as pd
import numpy as np
import re
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from nltk.tokenize import sent_tokenize

class ReviewAnalysisPipeline:
    def __init__(self, pipeline, encoder, role_focus='studio_production', num_sentences=3):
        self.pipeline = pipeline
        self.encoder = encoder
        self.role_focus = role_focus
        self.num_sentences = num_sentences
        self.predicted_df = None 
    
    def process_reviews(self, df, X_test):
        # Existing logic to predict and create df_with_predictions
        predicted_categories = self.pipeline.predict(X_test)
        predicted_categories = self.encoder.inverse_transform(predicted_categories) 
        self.predicted_df = pd.DataFrame(predicted_categories, columns=['Predicted_Rating'], index=X_test.index)
        df_with_predictions = df.join(predicted_df, how='inner')

        # Existing logic to filter and group reviews
        grouped_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High'] \
                          .groupby('Show')['Review'].apply(' '.join).reset_index()

        # Return both the grouped_reviews and df_with_predictions
        return grouped_reviews, df_with_predictions

    # Summarization functions integrated within the class
    def filter_positive_sentences(self, grouped_reviews):
        positive_sentences = []
        for sentence in sentences:
            blob = TextBlob(sentence)
            if blob.sentiment.polarity > 0.2:
                positive_sentences.append(sentence)
        return positive_sentences

    def summarize_with_lsa(self, positive_sentences):
        if len(positive_sentences) <= self.num_sentences:
            return ' '.join(positive_sentences)

        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(positive_sentences)
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
        avg = np.mean(cosine_sim, axis=1)
        top_idx = np.argsort(avg)[-self.num_sentences:]
        
        selected_sentences = ' '.join([positive_sentences[idx] for idx in sorted(top_idx)])
        parser = PlaintextParser.from_string(selected_sentences, Tokenizer("english"))
        summarizer = LsaSummarizer()
        summary = summarizer(parser.document, sentences_count=self.num_sentences)
        return ' '.join([str(sentence) for sentence in summary])
    
    def clean_and_format_summary(self, text, num_sentences):
        text = self.capitalize_sentences(text)
        text = re.sub(r'\s+([.,;?!])', r'\1', text)
        text = text.replace('...', '')
        text = re.sub(r'\s+', ' ', text).strip()
        if not text.endswith('.'):
            text += '.'
        return text

    def capitalize_sentences(self, text, num_sentences):
        sentences = sent_tokenize(text)
        return ' '.join([sentence[0].upper() + sentence[1:] for sentence in sentences])

    def simple_summarize(self, text, num_sentences):
        sentences = nltk.sent_tokenize(text)
        positive_sentences = self.filter_positive_sentences(sentences)
        summary_text = self.summarize_with_lsa(positive_sentences)
        return self.clean_and_format_summary(summary_text)

    def create_role_based_summary(self, text, role_focus, num_sentences):
        sentences = nltk.sent_tokenize(text)
        positive_sentences = self.filter_positive_sentences(sentences)
        keywords = {
            'content': ['story', 'character', 'plot', 'engagement'],
            'marketing': ['dub', 'sub', 'media', 'hype', 'viewer'],
            'studio_production': ['production', 'performance', 'direction', 'budget']
        }[role_focus]
        
        role_filtered_sentences = [sentence for sentence in positive_sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
        summary_text = self.summarize_with_lsa(role_filtered_sentences)
        return self.clean_and_format_summary(summary_text)

In [81]:
class ReviewAnalysisPipeline:
    def __init__(self, pipeline, encoder, role_focus='studio_production', num_sentences=3):
        self.pipeline = pipeline
        self.encoder = encoder
        self.role_focus = role_focus
        self.num_sentences = num_sentences
        self.predicted_df = None
    
    def process_reviews(self, df, X_test):
        predicted_categories = self.pipeline.predict(X_test)
        predicted_categories = self.encoder.inverse_transform(predicted_categories) 
        self.predicted_df = pd.DataFrame(predicted_categories, columns=['Predicted_Rating'], index=X_test.index)
        df_with_predictions = df.join(self.predicted_df, how='inner')  # Use self.predicted_df

        grouped_reviews = df_with_predictions[df_with_predictions['Predicted_Rating'] == 'High'] \
                          .groupby('Show')['Review'].apply(' '.join).reset_index()

        return grouped_reviews, df_with_predictions
    
    def summarize_reviews(self, grouped_reviews):
        # Apply general summarization
        grouped_reviews['General_Summary'] = grouped_reviews['Review'].apply(
            lambda text: self.simple_summarize(text))

        # Apply role-focused summarization
        grouped_reviews[f'{self.role_focus.capitalize()}_Summary'] = grouped_reviews['Review'].apply(
            lambda text: self.create_role_based_summary(text, self.role_focus))

        return grouped_reviews

    def filter_positive_sentences(self, sentences):  # Accepts sentences directly
        positive_sentences = []
        for sentence in sentences:
            blob = TextBlob(sentence)
            if blob.sentiment.polarity > 0.2:
                positive_sentences.append(sentence)
        return positive_sentences

    def summarize_with_lsa(self, positive_sentences):
        if len(positive_sentences) <= self.num_sentences:
            return ' '.join(positive_sentences)

        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(positive_sentences)
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
        avg = np.mean(cosine_sim, axis=1)
        top_idx = np.argsort(avg)[-self.num_sentences:]
        
        selected_sentences = ' '.join([positive_sentences[idx] for idx in sorted(top_idx)])
        parser = PlaintextParser.from_string(selected_sentences, Tokenizer("english"))
        summarizer = LsaSummarizer()
        summary = summarizer(parser.document, sentences_count=self.num_sentences)
        return ' '.join([str(sentence) for sentence in summary])

    
    def clean_and_format_summary(self, text):
        text = self.capitalize_sentences(text)
        text = re.sub(r'\s+([.,;?!])', r'\1', text)
        text = text.replace('...', '')
        text = re.sub(r'\s+', ' ', text).strip()
        if not text.endswith('.'):
            text += '.'
        return text

    def capitalize_sentences(self, text):
        sentences = sent_tokenize(text)
        return ' '.join([sentence[0].upper() + sentence[1:] for sentence in sentences])

    def get_keywords(self, role_focus):
        return {
            'content': ['story', 'character', 'plot', 'engagement'],
            'marketing': ['dub', 'sub', 'media', 'hype', 'viewer'],
            'studio_production': ['production', 'performance', 'direction', 'budget']
        }.get(role_focus, [])
    
    def simple_summarize(self, text):
        sentences = nltk.sent_tokenize(text)
        positive_sentences = self.filter_positive_sentences(sentences)
        summary_text = self.summarize_with_lsa(positive_sentences)
        return self.clean_and_format_summary(summary_text)

    def create_role_based_summary(self, text, role_focus):
        sentences = nltk.sent_tokenize(text)
        positive_sentences = self.filter_positive_sentences(sentences)
        keywords = self.get_keywords(role_focus)
        
        role_filtered_sentences = [sentence for sentence in positive_sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
        summary_text = self.summarize_with_lsa(role_filtered_sentences)
        return self.clean_and_format_summary(summary_text)

In [64]:
# Instantiating the ReviewAnalysisPipeline with the scikit-learn pipeline
review_analysis_pipeline = ReviewAnalysisPipeline(pipeline, encoder, role_focus='studio_production', num_sentences=3)

# Processing reviews to filter and group high-rating ones
grouped_reviews = review_analysis_pipeline.process_reviews(df, X_test)

# Summarizing the grouped reviews
final_summaries = review_analysis_pipeline.summarize_reviews(grouped_reviews)
print(final_summaries)

NameError: name 'predicted_df' is not defined

In [52]:
# Assuming analysis_pipeline is an instance of ReviewAnalysisPipeline
grouped_reviews, df_with_predictions = analysis_pipeline.process_reviews(df, X_test)

In [60]:
analysis_pipeline = ReviewAnalysisPipeline(pipeline, encoder)
df_with_predictions = analysis_pipeline.process_reviews(df, X_test)
# Ensure process_reviews was called to set predicted_df
print(analysis_pipeline.predicted_df.head())

NameError: name 'predicted_df' is not defined

In [58]:
# Assuming simple_summarize is defined and ready to use
grouped_reviews['General_Summary'] = grouped_reviews['Review'].apply(simple_summarize)

NameError: name 'simple_summarize' is not defined

In [71]:
# Iterate through each row in the DataFrame to print the show name and its summaries
for index, row in grouped_reviews.iterrows():
    print(f"Show: {row['Show']}\n")
    print("General Summary:")
    print(row['General_Summary'], "\n")  # Assuming the general summary is stored in this column
    
    # If you have a studio production-focused summary or any other specific summary, print it as well
    if 'Studio_production_Summary' in grouped_reviews.columns:
        print("Studio Production Summary:")
        print(row['Studio_production_Summary'], "\n")
    
    print("------------------------------------------------\n")

Show: Alice in Borderland

General Summary:


KeyError: 'General_Summary'

In [66]:
# Assuming 'simple_summarize' is your summarization function
# Ensure it's defined to accept a string (review text) and returns a summary string

# Apply the summarization function to each row in the 'Review' column
grouped_reviews['General_Summary'] = grouped_reviews['Review'].apply(simple_summarize)

# After this operation, check if the column exists
print(grouped_reviews.columns)  # This should now include 'General_Summary'

NameError: name 'simple_summarize' is not defined

In [74]:
# Assuming 'analysis_pipeline' is an instance of 'ReviewAnalysisPipeline'
final_grouped_reviews = analysis_pipeline.summarize_reviews(grouped_reviews)

TypeError: simple_summarize() takes 2 positional arguments but 3 were given

In [79]:
grouped_reviews, _ = analysis_pipeline.process_reviews(df, X_test)  # Process reviews to get grouped_reviews
final_grouped_reviews = analysis_pipeline.summarize_reviews(grouped_reviews)  # Apply summarization

NameError: name 'predicted_df' is not defined

In [76]:
for index, row in final_grouped_reviews.iterrows():
    print(f"Show: {row['Show']}\n")
    print("General Summary:")
    print(row['General_Summary'], "\n")  # Assuming the general summary is stored in this column
    
    # If you have a specific summary such as studio production, print it as well
    if 'Studio_production_Summary' in final_grouped_reviews.columns:
        print("Studio Production Summary:")
        print(row['Studio_production_Summary'], "\n")
    
    print("-" * 75)  # Separator for readability

NameError: name 'final_grouped_reviews' is not defined