# Data cleaning

In [1]:
!pip install scikit-learn==1.1.3

Collecting scikit-learn==1.1.3
  Downloading scikit_learn-1.1.3-cp39-cp39-win_amd64.whl (7.6 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.1.3


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Load the movie review dataset from CSV into a DataFrame and preview it
data_movie = pd.read_csv('dataset_0.csv')
data_movie

ModuleNotFoundError: No module named 'mlxtend'

In [None]:
# info must be called: the bare attribute only shows the bound method's repr,
# whereas info() prints the column names, dtypes and non-null counts.
data_movie.info()

In [None]:
# Treat empty or whitespace-only cells as missing, then count missing values per column
data_movie = data_movie.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
data_movie.isna().sum()

In [None]:
# Drop every row that contains at least one missing value
data_movie = data_movie.dropna()
data_movie

In [None]:
# Re-run the blank-to-NaN replacement and the per-column missing count,
# this time on the frame left after dropping rows with missing values
data_movie =data_movie .replace(r'^\s*$',np.nan, regex=True)
data_movie.isnull().sum()

# Filter for action and comedy movies

In [None]:
# Split the cleaned data by genre.
# .copy() makes each subset an independent DataFrame, so adding columns to it
# later (e.g. 'tokens') neither raises SettingWithCopyWarning nor touches data_movie.
action = data_movie[data_movie['genre'].isin(['Action'])].copy()
comedy = data_movie[data_movie['genre'].isin(['Comedy'])].copy()

In [None]:
# Preview the rows whose genre is 'Action'
action

In [None]:
# Preview the rows whose genre is 'Comedy'
comedy

In [None]:
# Distinct values of the 'movie' column within the action subset
action['movie'].unique()

In [None]:
# Distinct values of the 'movie' column within the comedy subset
comedy['movie'].unique()

## Q1.1 Topic modelling of action movies

In [None]:
# Wordcloud

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Aggregate all action reviews into one string and generate the word cloud.
# Join with a space so the last word of one review is not glued onto the
# first word of the next review.
review_action = action['review']
all_review = ' '.join(review_action.tolist())

# Draw on the explicitly created axes instead of the implicit pyplot state
fig, ax = plt.subplots()
wordcloud = WordCloud(background_color="white", colormap='tab10', max_words=200).generate(all_review)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()


In [None]:
# Lower-case every review; review_action becomes a plain list of strings
review_action = [review.lower() for review in review_action]

# Print the first 3 reviews
print(review_action[:3])


In [None]:
# Conduct lemmatization for the words in the text
from nltk.stem import WordNetLemmatizer

# Build the lemmatizer once and reuse it instead of constructing a new
# WordNetLemmatizer for every single word.
lemmatizer = WordNetLemmatizer()

# One token list per review; each review is split on single spaces.
tokens = [[lemmatizer.lemmatize(word) for word in sent.split(" ")]
          for sent in review_action]

In [None]:
# Customized the stopwords
from sklearn.feature_extraction import text
my_additional_stop_words = ["movie","just",'film',"one",'action','plot','character','scene']
# CountVectorizer documents stop_words as 'english', a list, or None, and
# newer scikit-learn releases reject a frozenset during parameter validation.
# Convert the union to a list; sorting makes the order deterministic.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_additional_stop_words))

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Set how many topics we need
NUM_TOPICS = 10
action['tokens']=tokens
# Re-join each review's lemmatized tokens into one space-separated string
text_train = list(action['tokens'].apply(lambda x: ' '.join(x)))

# Convert a collection of text documents to a matrix of token counts.
## min_df: ignore terms that have a document frequency strictly lower than the given threshold
## max_df: ignore terms that have a document frequency strictly higher than the given threshold
## stop_words: 'english', list
## lowercase: Convert all characters to lowercase before tokenizing.
## token_pattern: Regular expression denoting what constitutes a "token".
##   Written as a raw string so that `\-` reaches the regex engine unchanged
##   instead of being an invalid Python string escape.
vectoriser = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words=stop_words, lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

data_vectorized = vectoriser.fit_transform(text_train)

# Build a Latent Dirichlet Allocation Model (fixed random_state for reproducibility)
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, random_state=258, learning_method='online')
# Fit the model and keep the transformed document-term matrix (one row per review)
lda_Z = lda_model.fit_transform(data_vectorized)

In [None]:
# Shape of the matrix returned by fit_transform on the action reviews
print(lda_Z.shape)

In [None]:
# Show the top 10 most frequent words in each topic

def print_topics(model, vectoriser, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``components_``; each row is iterated
        as one topic and indexed by term position.
    vectoriser : fitted vectoriser exposing ``get_feature_names_out()`` or,
        on old scikit-learn versions, ``get_feature_names()``.
    top_n : int, default 10
        Number of terms printed per topic.

    Prints, for each topic, a header line ``Topic <k>:`` followed by a list
    of ``(term, weight)`` tuples in decreasing weight order. Returns None.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only when it is missing.
    # Looking the names up once also avoids rebuilding the whole vocabulary
    # list for every printed term.
    if hasattr(vectoriser, "get_feature_names_out"):
        feature_names = vectoriser.get_feature_names_out()
    else:
        feature_names = vectoriser.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx+1))
        # argsort is ascending, so the reversed tail slice yields the top_n
        # term positions from largest to smallest weight.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
 
# Print the top terms of every topic of the fitted LDA model, then a separator line
print("LDA Model:")
print_topics(lda_model, vectoriser)
print("=" * 20)

In [None]:
# Transform the document-term matrix with the fitted LDA model and
# print the resulting row for the document at position 3
x = lda_model.transform(data_vectorized)
print(x[3])

In [None]:
# Put the per-review topic weights into a DataFrame, one column per topic
topics = pd.DataFrame(x)
# Derive 'topic1'..'topicN' from the matrix width so the cell keeps working
# when the number of topics changes (a fixed 10-name list would raise a
# length-mismatch error).
topics.columns = [f'topic{i}' for i in range(1, topics.shape[1] + 1)]

# Reset to a 0..n-1 index so the rows line up with `topics` by position
action = action.reset_index(drop=True)

topics['review'] = action['review']
topics['review_post_date'] = action['review_post_date']


topics

In [None]:
import pyLDAvis.lda_model
 
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# The parameters we need:
## fitted LDA model: lda_model
## document-term count matrix (output of vectoriser.fit_transform): data_vectorized
## fitted CountVectorizer: vectoriser

panel = pyLDAvis.lda_model.prepare(lda_model, data_vectorized, vectoriser, sort_topics = False)
panel

## Q2.1 Sentiment analysis for action movies

In [None]:
import nltk
# Fetch the lexicon resource used by the VADER sentiment analyzer below
nltk.download('vader_lexicon')

In [None]:
%%time
# Import `SentimentIntensityAnalyzer` and create the analyzer instance
# that the following cells use to score reviews
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

In [None]:
# Score every review with the analyzer and keep its `compound` value as the polarity score
scores = [sentiment.polarity_scores(tex)["compound"] for tex in topics['review']]

# Show the score of index 1
scores[1]

In [None]:
# Get the sentiment scores weighted by the topic relevance probability
### Create 'topic1_senti': each review's sentiment score * its topic 1 weight

topics['topic1_senti'] = topics['topic1'] * scores

In [None]:
# Topic 5: each review's sentiment score * its topic 5 weight
topics['topic5_senti'] = topics['topic5'] * scores

In [None]:
# Average the topic-weighted sentiment of topic 1 and topic 5 per review post date
topic_movie_data = (
    topics
    .groupby('review_post_date')
    .agg(topic1_sentiment=('topic1_senti', 'mean'),
         topic5_sentiment=('topic5_senti', 'mean'))
)
topic_movie_data.head()

In [None]:
# Calculate the mean of each column (i.e. across all post dates)
average_sentiment = topic_movie_data.mean()

print(average_sentiment)

In [None]:
# Overall sentiment scores

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()
# Render the whole review column as one string and score it in a single call.
# NOTE(review): to_string() is a display formatter; confirm its padding/formatting
# does not alter the review text in a way that matters for the score.
text_tweets = action['review'].to_string(index=False)  
sentiment_scores=sentiment.polarity_scores(text_tweets)
sentiment_scores

In [None]:
# Join the per-date topic sentiment (indexed by review_post_date) onto the
# action reviews via their 'review_post_date' column
merged_data = pd.merge(topic_movie_data, action, left_index=True, right_on='review_post_date')
merged_data.head()

## Q3.1 Regression models of action movies

In [None]:
# astype(float) returns a new Series, so the zero replacement below does not
# write into merged_data through chained assignment (SettingWithCopyWarning)
# and the float placeholder is not assigned into an integer column.
y = merged_data['box_office_revenue'].astype(float)

# log(0) is undefined, so replace zero revenue with a very small positive number
y[y == 0] = 0.0001

X = merged_data[['num_helpful','num_response','budget','max_screens','topic1_sentiment','topic5_sentiment']]
y = np.log(y)

# Ordinary least squares of log revenue on the predictors plus an intercept
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())



In [None]:
import statsmodels.api as sm

def forward_stepwise(X, y, threshold_in):
    """Forward stepwise feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an
    intercept) per remaining candidate column added to the already selected
    ones, and selects the candidate with the smallest p-value if that
    p-value is below ``threshold_in``. Selection stops when no candidate
    qualifies or every column has been selected.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns.
    y : array-like
        Response passed to ``sm.OLS``.
    threshold_in : float
        A candidate's p-value must be strictly below this to be added.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    candidates = X.columns.tolist()
    best_features = []

    # Stop explicitly once every candidate has been selected
    while len(best_features) < len(candidates):
        # Keep the original column order (a set difference has arbitrary
        # order, which makes tie-breaking between equal p-values unstable).
        remaining_features = [col for col in candidates if col not in best_features]
        # Explicit float dtype: without it the Series is created with a
        # default dtype that depends on the pandas version (object on
        # pandas 2), and idxmin() is not defined for object dtype.
        new_pval = pd.Series(index=remaining_features, dtype=float)

        for new_column in remaining_features:
            model = sm.OLS(y, sm.add_constant(X[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]

        # A NaN minimum compares False here, which also ends the search
        if new_pval.min() < threshold_in:
            best_features.append(new_pval.idxmin())
        else:
            break

    return best_features

# Use the function to get the best features (p-value entry threshold 0.05)
best_features = forward_stepwise(X, y, 0.05)

print(best_features)


### Gradient boosting (continuous DV): action movie revenue vs other features

In [None]:
from sklearn.model_selection import train_test_split


# Set the variables
x = merged_data[['max_screens', 'topic5_sentiment', 'budget', 'topic1_sentiment', 'num_response']]

# astype(float) returns a new Series, so replacing zeros does not write into
# merged_data through chained assignment; zeros are replaced because log(0)
# is undefined.
y = merged_data['box_office_revenue'].astype(float)
y[y == 0] = 0.0001
y = np.log(y)

# Split into 75% training and 25% testing data (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [None]:
# Load and fit the model
import xgboost as xgb
# 'reg:squarederror' is the current name of the squared-error regression
# objective; 'reg:linear' is its deprecated alias.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                           learning_rate=0.1, max_depth=5, alpha=10, n_estimators=200)
xgb_reg.fit(x_train, y_train)

In [None]:
# Raw feature importance array of the fitted XGBoost regressor
print(xgb_reg.feature_importances_)

In [None]:
# Label the importances with the columns of the feature frame `x` rather than
# a second hand-typed list that could drift out of sync with it.
xgb_reg_visual = pd.DataFrame(list(xgb_reg.feature_importances_),
                              columns=['Action Movie Feature Importance'],
                              index=x.columns)

xgb_reg_visual

In [None]:
# Define the 'f_importances' helper used to visualize feature importances
def f_importances(importance, names,
                  title='XGBoost: Continuous DV Model for action movies revenue vs other features'):
    """Draw a horizontal bar chart of feature importances.

    Bars are ordered by decreasing importance starting at y-position 0.

    Parameters
    ----------
    importance : array-like of numbers
        One importance score per feature.
    names : sequence
        Feature names, positionally aligned with ``importance``.
    title : str, optional
        Chart title; defaults to the action-movie XGBoost title.
    """
    # Accept plain lists as well as NumPy arrays (lists have no argsort)
    importance = np.asarray(importance)
    order = importance.argsort()[::-1]
    sorted_names = [names[i] for i in order]
    positions = range(len(sorted_names))

    # Create the bar plot
    plt.barh(positions, importance[order], align='center', color=(0.2, 0.4, 0.6, 0.6))
    # Label each bar with its feature name on the y-axis
    plt.yticks(positions, sorted_names)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title(title)
    plt.show()

In [None]:
# Get feature importances and feature names, then plot them
importance = xgb_reg.feature_importances_
names = x.columns  # column labels of the feature frame x
f_importances(importance, names)

In [None]:
# "Error rate" here is 1 minus the value returned by score() on the test split.
# NOTE(review): for scikit-learn-style regressors score() is R^2, so this is
# the unexplained-variance share rather than a classification error rate — confirm.
error_rate = 1 - xgb_reg.score(x_test, y_test)
print("Error rate: ", error_rate)

## Q1.2 Topic modelling of comedy movies

In [None]:
# Wordcloud

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Aggregate all comedy reviews into one string and generate the word cloud.
# Join with a space so the last word of one review is not glued onto the
# first word of the next review.
review_comedy = comedy['review']
all_review = ' '.join(review_comedy.tolist())

# Draw on the explicitly created axes instead of the implicit pyplot state
fig, ax = plt.subplots()
wordcloud = WordCloud(background_color="white", colormap='tab10', max_words=200).generate(all_review)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()


In [None]:
# Lower-case every review; review_comedy becomes a plain list of strings
review_comedy = [review.lower() for review in review_comedy]

# Print the first 3 reviews
print(review_comedy[:3])


In [None]:
# Conduct lemmatization for the words in the text
from nltk.stem import WordNetLemmatizer

# Build the lemmatizer once and reuse it instead of constructing a new
# WordNetLemmatizer for every single word.
lemmatizer = WordNetLemmatizer()

# One token list per review; each review is split on single spaces.
tokens = [[lemmatizer.lemmatize(word) for word in sent.split(" ")]
          for sent in review_comedy]

In [None]:
# Customized the stopwords
from sklearn.feature_extraction import text
my_additional_stop_words = ['movie', 'one', 'film','just','really','comedy','seen','people','like','funny','make','lot','way','think','movies']
# CountVectorizer documents stop_words as 'english', a list, or None, and
# newer scikit-learn releases reject a frozenset during parameter validation.
# Convert the union to a list; sorting makes the order deterministic.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_additional_stop_words))

In [None]:
import nltk
# Fetch the lexicon resource used by the VADER sentiment analyzer
nltk.download('vader_lexicon')

In [None]:
%%time
# Import `SentimentIntensityAnalyzer` and create the analyzer instance
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Set how many topics we need
NUM_TOPICS = 10
comedy['tokens']=tokens
# Re-join each review's lemmatized tokens into one space-separated string
text_train = list(comedy['tokens'].apply(lambda x: ' '.join(x)))

# Convert a collection of text documents to a matrix of token counts.
## min_df: ignore terms that have a document frequency strictly lower than the given threshold
## max_df: ignore terms that have a document frequency strictly higher than the given threshold
## stop_words: 'english', list
## lowercase: Convert all characters to lowercase before tokenizing.
## token_pattern: Regular expression denoting what constitutes a "token".
##   Written as a raw string so that `\-` reaches the regex engine unchanged
##   instead of being an invalid Python string escape.
vectoriser = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words=stop_words, lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

data_vectorized = vectoriser.fit_transform(text_train)

# Build a Latent Dirichlet Allocation Model (fixed random_state for reproducibility)
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, random_state=258, learning_method='online')
# Fit the model and keep the transformed document-term matrix (one row per review)
lda_Z = lda_model.fit_transform(data_vectorized)

In [None]:
# Shape of the matrix returned by fit_transform on the comedy reviews
print(lda_Z.shape)

In [None]:
# Show the top 10 most frequent words in each topic

def print_topics(model, vectoriser, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``components_``; each row is iterated
        as one topic and indexed by term position.
    vectoriser : fitted vectoriser exposing ``get_feature_names_out()`` or,
        on old scikit-learn versions, ``get_feature_names()``.
    top_n : int, default 10
        Number of terms printed per topic.

    Prints, for each topic, a header line ``Topic <k>:`` followed by a list
    of ``(term, weight)`` tuples in decreasing weight order. Returns None.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only when it is missing.
    # Looking the names up once also avoids rebuilding the whole vocabulary
    # list for every printed term.
    if hasattr(vectoriser, "get_feature_names_out"):
        feature_names = vectoriser.get_feature_names_out()
    else:
        feature_names = vectoriser.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx+1))
        # argsort is ascending, so the reversed tail slice yields the top_n
        # term positions from largest to smallest weight.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
 
# Print the top terms of every topic of the fitted LDA model, then a separator line
print("LDA Model:")
print_topics(lda_model, vectoriser)
print("=" * 20)

In [None]:
# Transform the document-term matrix with the fitted LDA model and
# print the resulting row for the document at position 3
x = lda_model.transform(data_vectorized)
print(x[3])

In [None]:
# Put the per-review topic weights into a DataFrame, one column per topic
topics = pd.DataFrame(x)
# Derive 'topic1'..'topicN' from the matrix width so the cell keeps working
# when the number of topics changes (a fixed 10-name list would raise a
# length-mismatch error).
topics.columns = [f'topic{i}' for i in range(1, topics.shape[1] + 1)]

# Reset to a 0..n-1 index so the rows line up with `topics` by position
comedy = comedy.reset_index(drop=True)

topics['review'] = comedy['review']
topics['review_post_date'] = comedy['review_post_date']


topics

In [None]:
import pyLDAvis.lda_model
 
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# The parameters we need:
## fitted LDA model: lda_model
## document-term count matrix (output of vectoriser.fit_transform): data_vectorized
## fitted CountVectorizer: vectoriser

panel = pyLDAvis.lda_model.prepare(lda_model, data_vectorized, vectoriser, sort_topics = False)
panel

## Q2.2 Sentiment analysis for comedy movies

In [None]:
import nltk
# Fetch the lexicon resource used by the VADER sentiment analyzer below
nltk.download('vader_lexicon')

In [None]:
%%time
# Import `SentimentIntensityAnalyzer` and create the analyzer instance
# that the following cells use to score reviews
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

In [None]:
# Score every review with the analyzer and keep its `compound` value as the polarity score
scores = [sentiment.polarity_scores(tex)["compound"] for tex in topics['review']]

# Show the score of index 1
scores[1]

In [None]:
# Preview the review column that was just scored
topics['review']

In [None]:
# Get the sentiment scores weighted by the topic relevance probability
### Create 'topic2_senti': each review's sentiment score * its topic 2 weight

topics['topic2_senti'] = topics['topic2'] * scores

In [None]:
# Topic 5: each review's sentiment score * its topic 5 weight
topics['topic5_senti'] = topics['topic5'] * scores

In [None]:
# Average the topic-weighted sentiment of topic 2 and topic 5 per review post date
topic_movie_data = (
    topics
    .groupby('review_post_date')
    .agg(topic2_sentiment=('topic2_senti', 'mean'),
         topic5_sentiment=('topic5_senti', 'mean'))
)
topic_movie_data.head()

In [None]:
# Calculate the mean of each column (i.e. across all post dates)
average_sentiment = topic_movie_data.mean()

print(average_sentiment)

In [None]:
# Overall sentiment scores

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()
# Render the whole review column as one string and score it in a single call.
# NOTE(review): to_string() is a display formatter; confirm its padding/formatting
# does not alter the review text in a way that matters for the score.
text_tweets = comedy['review'].to_string(index=False)  
sentiment_scores=sentiment.polarity_scores(text_tweets)
sentiment_scores

In [None]:
# Join the per-date topic sentiment (indexed by review_post_date) onto the
# comedy reviews via their 'review_post_date' column
merged_data = pd.merge(topic_movie_data, comedy, left_index=True, right_on='review_post_date')
merged_data.head()

## Q3.2 Regression models of comedy movies

In [None]:
# astype(float) returns a new Series, so the zero replacement below does not
# write into merged_data through chained assignment (SettingWithCopyWarning)
# and the float placeholder is not assigned into an integer column.
y = merged_data['box_office_revenue'].astype(float)

# log(0) is undefined, so replace zero revenue with a very small positive number
y[y == 0] = 0.0001

X = merged_data[['num_helpful','num_response','budget','max_screens','topic2_sentiment','topic5_sentiment']]
y = np.log(y)

# Ordinary least squares of log revenue on the predictors plus an intercept
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())



In [None]:
import statsmodels.api as sm

def forward_stepwise(X, y, threshold_in):
    """Forward stepwise feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an
    intercept) per remaining candidate column added to the already selected
    ones, and selects the candidate with the smallest p-value if that
    p-value is below ``threshold_in``. Selection stops when no candidate
    qualifies or every column has been selected.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns.
    y : array-like
        Response passed to ``sm.OLS``.
    threshold_in : float
        A candidate's p-value must be strictly below this to be added.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    candidates = X.columns.tolist()
    best_features = []

    # Stop explicitly once every candidate has been selected
    while len(best_features) < len(candidates):
        # Keep the original column order (a set difference has arbitrary
        # order, which makes tie-breaking between equal p-values unstable).
        remaining_features = [col for col in candidates if col not in best_features]
        # Explicit float dtype: without it the Series is created with a
        # default dtype that depends on the pandas version (object on
        # pandas 2), and idxmin() is not defined for object dtype.
        new_pval = pd.Series(index=remaining_features, dtype=float)

        for new_column in remaining_features:
            model = sm.OLS(y, sm.add_constant(X[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]

        # A NaN minimum compares False here, which also ends the search
        if new_pval.min() < threshold_in:
            best_features.append(new_pval.idxmin())
        else:
            break

    return best_features

# Use the function to get the best features (p-value entry threshold 0.05)
best_features = forward_stepwise(X, y, 0.05)

print(best_features)


### Continuous random forest model for comedy revenue vs other features

In [None]:
from sklearn.model_selection import train_test_split


# Set the variables
x = merged_data[['max_screens', 'budget', 'topic2_sentiment', 'topic5_sentiment']]

# astype(float) returns a new Series, so replacing zeros does not write into
# merged_data through chained assignment; zeros are replaced because log(0)
# is undefined.
y = merged_data['box_office_revenue'].astype(float)
y[y == 0] = 0.0001
y = np.log(y)

# Split into 75% training and 25% testing data (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fix random_state so the forest, its predictions and its feature importances
# are identical on every run (an unseeded forest changes each time).
rnd_reg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=0)
rnd_reg.fit(x_train, y_train)
# Predictions on the held-out test split
y_pred = rnd_reg.predict(x_test)

In [None]:
# Predicted values for the test split
y_pred

In [None]:
# Actual target values of the test split
y_test

In [None]:
# Put the actual and predicted test values side by side and export them to CSV
y_comparison = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
y_comparison.to_csv('y_comparison.csv', index=False)

In [None]:
# score() of a scikit-learn regressor returns R^2, so 1 - R^2 is the share of
# variance left unexplained. The offset must be exactly 1 (not 1.4) for that
# reading to hold. The DataFrame/Series are passed as-is (not .values) so the
# feature names match those the model was fitted with.
error_rate = 1 - rnd_reg.score(x_test, y_test)
print(error_rate)

In [None]:
# Calculate RMSE

from sklearn.metrics import mean_squared_error

# Take the square root explicitly instead of relying on `squared=False`,
# which is deprecated in newer scikit-learn releases; the result is a root
# mean squared error, so name it accordingly.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

In [None]:
# Label the importances with the columns of the feature frame `x` rather than
# a second hand-typed list that could drift out of sync with it.
df_rnd_visual = pd.DataFrame(list(rnd_reg.feature_importances_),
                             columns=['Feature Importance'],
                             index=x.columns)
df_rnd_visual

In [None]:
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# A function to visualise the feature importance or coef
def f_importances(coef, names):
    """Plot ``coef`` as horizontal bars labelled with ``names``.

    The (value, name) pairs are sorted in ascending order before plotting,
    so the smallest value is drawn at y-position 0.
    """
    ranked = sorted(zip(coef, names))
    imp, names = zip(*ranked)
    positions = range(len(names))
    bar_colour = (0.2, 0.4, 0.6, 0.6)

    plt.barh(positions, imp, align='center', color=bar_colour)
    plt.yticks(positions, names)
    plt.xlabel('Coefficient rating', fontsize=12)
    plt.ylabel('Features', fontsize=12)
    plt.title("Continuous random forest model for comendy revenue vs other features", fontsize=14)
    plt.show()

features_names = ['max_screens', 'budget', 'topic2_sentiment', 'topic5_sentiment']

# Plot the importances of the forest that was already fitted and evaluated
# in the earlier cells. Fitting a second, unseeded forest here would
# overwrite rnd_reg / y_pred and make this chart disagree with the
# importance table and the metrics reported for the first model.
rnd_importance = rnd_reg.feature_importances_
f_importances(rnd_importance, features_names)
