## Data cleaning

In [1]:
!pip install scikit-learn==1.1.3



In [2]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.22.0


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

#read .csv data into pd 
data_tweets = pd.read_csv('tweets_sport.csv');
data_tweets

ModuleNotFoundError: No module named 'dash'

In [None]:
# Call the method: the bare attribute only shows the bound-method repr, not the column summary.
data_tweets.info()

In [None]:
# Keep only the tweets of events 1, 2, 7 and 10.
# .copy() makes g3_tweets an independent frame, so the columns added to it in
# later cells do not raise SettingWithCopyWarning.
g3_tweets = data_tweets[data_tweets['event_number'].isin([1, 2, 7, 10])].copy()
g3_tweets

In [None]:
# Handling outliers: draw one box plot per int64/float64 column

numeric_columns = [name for name in g3_tweets
                   if g3_tweets[name].dtype in ['int64', 'float64']]
for name in numeric_columns:
    sns.boxplot(x=g3_tweets[name])
    plt.show()

In [None]:
# Find out the missing values.
g3_tweets.isnull().sum()

## Wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Aggregate all the tweets into one string and generate the word cloud.
text_tweets3 = g3_tweets['text']
# Join with a space: joining with '' glues the last word of one tweet onto the
# first word of the next and produces words that appear in no tweet.
# dropna()/astype(str) keeps the join from failing on missing or non-string text.
all_tweets = ' '.join(text_tweets3.dropna().astype(str))

wordcloud = WordCloud(background_color="white", colormap='tab10', max_words=200).generate(all_tweets)

fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()


## Topic modelling

In [None]:
# Change the text to lower case.
# Built directly from g3_tweets['text'] so the cell gives the same list on every run.
# Missing text becomes '' so the list keeps exactly one entry per row of g3_tweets.
text_tweets3 = g3_tweets['text'].fillna('').astype(str).str.lower().tolist()

# print the first 3 tweets
print(text_tweets3[:3])


In [None]:
# Conduct lemmatization for the words in the text
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus, which is not downloaded anywhere else in this notebook.
# NOTE(review): some NLTK releases also need the 'omw-1.4' resource; add it if a LookupError asks for it.
nltk.download('wordnet', quiet=True)

# One lemmatizer for the whole corpus instead of a new instance per word.
lemmatizer = WordNetLemmatizer()

# tokens[k] is the list of lemmatized, space-separated words of text_tweets3[k].
tokens = [[lemmatizer.lemmatize(word) for word in sent.split(" ")]
          for sent in text_tweets3]

In [None]:
# Customised stop words: project-specific terms added to scikit-learn's English list
from sklearn.feature_extraction import text 

# Each term is listed once (the original list repeated several of them).
my_additional_stop_words = ["supercars", "racing", "race", "sus", "lens", "http", "tas", "tcm",
                            "darwin", "sandown", "hay", "size", "idr", "adelaide", "clipsal",
                            "bathurst", "coateshire", "sydney", "townsville"]

# CountVectorizer documents stop_words as 'english', a list or None, so the union
# (a frozenset) is converted to a list; sorted() keeps the order stable between runs.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_additional_stop_words))

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Set how many topics we need
NUM_TOPICS = 10
g3_tweets['tokens']=tokens
text_train = list(g3_tweets['tokens'].apply(lambda x: ' '.join(x)))

# Convert a collection of text documents to a matrix of token counts.
## min_df: ignore terms that have a document frequency strictly lower than the given threshold
## max_df: ignore terms that have a document frequency strictly higher than the given threshold
## stop_words: ‘english’, list
## lowercase: Convert all characters to lowercase before tokenizing.
## token_pattern: Regular expression denoting what constitutes a “token”
vectoriser = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words=stop_words,lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

data_vectorized= vectoriser.fit_transform(text_train)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, random_state=258, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
 
#text = "mclaughlin: we can win front row qualifying only the start for volvo says young gun"
#x = lda_model.transform(vectorizer.transform([text]))[0]
#print(x, x.sum())

In [None]:
print(lda_Z.shape)

## Show the top 10 most frequent words in each topic 

In [None]:
def print_topics(model, vectoriser, top_n=10):
    """Print the `top_n` highest-weighted terms of every topic in `model`.

    Parameters
    ----------
    model : fitted topic model exposing `components_`, one row of term weights per topic.
    vectoriser : fitted vectoriser that maps column positions to terms through
        `get_feature_names_out()` (or the older `get_feature_names()`).
    top_n : number of terms printed per topic, highest weight first.

    Prints a "Topic k:" line followed by a list of (term, weight) tuples for each topic.
    """
    # Look the vocabulary up once instead of once per printed term.
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
    # so the newer accessor is preferred whenever it exists.
    if hasattr(vectoriser, "get_feature_names_out"):
        feature_names = vectoriser.get_feature_names_out()
    else:
        feature_names = vectoriser.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx+1))
        # argsort is ascending; the reversed slice takes the last top_n positions, largest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
 
print("LDA Model:")
print_topics(lda_model, vectoriser)
print("=" * 20)

In [None]:
# Output of the fitted LDA model for the token-count matrix it was trained on.
# NOTE(review): the name x is reassigned in the SVM section further down; the cell
# that builds `topics` from x has to run before that one.
x = lda_model.transform(data_vectorized)
# Row at position 3 of that output.
print(x[3])

In [None]:
g3_tweets['text']

In [None]:
g3_tweets

In [None]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# VADER analyser; later cells reuse it under the name `sia`
sia = SentimentIntensityAnalyzer()


def _compound_score(tweet):
    """Return VADER's 'compound' polarity score for one tweet (coerced to str)."""
    return sia.polarity_scores(str(tweet))['compound']


# One compound score per row of g3_tweets, computed from its 'text' column
g3_tweets['sentiment_scores'] = g3_tweets['text'].apply(_compound_score)

g3_tweets

In [None]:
# One column per topic. The names follow the number of columns in x rather than a
# hard-coded list of ten, so changing NUM_TOPICS does not break this cell.
topics = pd.DataFrame(x)
topics.columns = [f'topic{k}' for k in range(1, topics.shape[1] + 1)]

# topics has a 0..n-1 index; give g3_tweets the same one so the columns below align row by row.
g3_tweets = g3_tweets.reset_index(drop=True)

topics['tweet'] = g3_tweets['text']
topics['month'] = g3_tweets['month']
topics['day'] = g3_tweets['day']


topics

In [None]:
import pyLDAvis.lda_model
 
pyLDAvis.enable_notebook()

# The parameters we need:
## LDA model: lda_model
## matrix of token counts: data_vectorized
## fitted CountVectorizer: vectoriser

panel = pyLDAvis.lda_model.prepare(lda_model, data_vectorized, vectoriser, sort_topics = False)
panel

## Sentiment analysis

In [None]:
import nltk
nltk.download('vader_lexicon')

In [None]:
%%time
# import `SentimentIntensityAnalyzer` and load a model
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()
sentiment 

In [None]:
# Use the model on each tweet and keep `compound` as the polarity score.
# str(...) matches the scoring of g3_tweets['text'] elsewhere in this notebook,
# so a non-string entry cannot break the loop.
scores = [sentiment.polarity_scores(str(tex))["compound"] for tex in topics['tweet']]

# Show the score of index 1 
scores[1]

In [None]:
# Weight every tweet's sentiment score by its topic relevance probability:
### topic<i>_senti = topic<i> * sentiment score, for topics 1..10

for topic_no in range(1, 11):
    source_col = f'topic{topic_no}'
    topics[f'{source_col}_senti'] = topics[source_col] * scores


In [None]:
# Mean topic-weighted sentiment of each of the ten topics per event day (month, day)

grouped_agg_dict = {
    f'topic{topic_no}_sentiment': (f'topic{topic_no}_senti', 'mean')
    for topic_no in range(1, 11)
}

topic_tweet_data = topics.groupby(['month', 'day']).agg(**grouped_agg_dict)


In [None]:
# reset_index turns the (month, day) group index back into ordinary columns.
# The flattened copy is the frame shown here; topic_tweet_data itself keeps its group index.
topic_tweet_dataa = topic_tweet_data.reset_index()
topic_tweet_dataa

In [None]:
# Named aggregations: topic<i>_senti_day is the mean of topic<i>_senti
# (the range end is the number of topics + 1)
new_columns = {
    f'topic{topic_no}_senti_day': (f'topic{topic_no}_senti', 'mean')
    for topic_no in range(1, 11)
}

# Group by month and day and calculate the mean sentiment score for each topic
topic_tweet_data = topics.groupby(['month', 'day']).agg(**new_columns)

topic_tweet_data

In [None]:
# Calculate the mean of each column
average_sentiment = topic_tweet_data.mean()

print(average_sentiment)

In [None]:
topic_tweet_data.head()

In [None]:
# Overall sentiment scores: VADER applied to all tweets as one document

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()
# Join the raw tweet strings. Series.to_string() produces a display rendering
# (padded, and long values may be shortened by pandas display options —
# NOTE(review): confirm), which is not the text that should be scored.
text_tweets = '\n'.join(g3_tweets['text'].dropna().astype(str))
sentiment_scores=sentiment.polarity_scores(text_tweets)
sentiment_scores

## TV: combining tweet counts and sentiment with TV ratings

In [None]:
# Build a working copy with the sentiment score and a parsed timestamp.
# g3_tweets itself is left untouched: set_index('time', inplace=True) removed the
# 'time' column from it, so running the cell a second time failed.
tweets_timed = g3_tweets.assign(
    sentiment_scores=g3_tweets['text'].apply(lambda tweet: sia.polarity_scores(str(tweet))['compound']),
    time=pd.to_datetime(g3_tweets['time']),
)

# Group into 15-minute windows per event and weekday, then aggregate.
# Grouper(key='time') bins on the 'time' column directly, so no index change is needed.
group_tweet_data = tweets_timed.groupby(
    ['event_number', 'weekday', pd.Grouper(key='time', freq='15min')]
).agg(
    number_of_tweet=('text', 'count'),
    number_of_player=('player_dummy', 'sum'),
    number_of_team=('team_dummy', 'sum'),
    sentiment_score=('sentiment_scores', 'mean')  # calculate the average sentiment score
)

In [None]:
# reset_index to break the pivot table to normal table
group_tweet_data = group_tweet_data.reset_index()
group_tweet_data.head()

In [None]:
# 'start_time' = 'time' shifted forward by 15 minutes, the key used to match tv_rating
import datetime as dt
window_length = dt.timedelta(minutes=15)
group_tweet_data['start_time'] = group_tweet_data['time'].add(window_length)
group_tweet_data

In [None]:
# 'time' and 'start_time' hold timestamps; convert both to 'HH:MM' strings.
# The vectorised .dt accessor replaces the row-by-row apply(lambda x: x.strftime(...)).
# NOTE(review): the columns are strings afterwards, so this cell cannot be run twice in a row.

group_tweet_data['time'] = group_tweet_data['time'].dt.strftime('%H:%M')
group_tweet_data['start_time'] = group_tweet_data['start_time'].dt.strftime('%H:%M')

group_tweet_data

In [None]:
#read .csv data into pd 
tv_data = pd.read_csv('tv_rating_new.csv');
tv_data

In [None]:
# Left-join the tweet aggregates onto the TV ratings using the three shared key columns
merge_keys = ['start_time', 'event_number', 'weekday']
merged_data = tv_data.merge(group_tweet_data, how='left', on=merge_keys)

# drop every row that contains a null value
merged_data = merged_data.dropna()
merged_data

In [None]:
merged_data['event_number'].value_counts()

In [None]:
merged_data.dtypes

## Regression models 1 : tweet vs all

In [None]:
# Number of tweets vs other features

X = merged_data[['number_of_player','number_of_team','sentiment_score','weekday']]

# log(0) is undefined, so zero counts are replaced by a small positive value first.
# replace() returns a new Series; assigning through y[y==0] writes into a slice of
# merged_data and can silently change the column that later models use as a feature.
y = np.log(merged_data['number_of_tweet'].replace(0, 0.0001))

# OLS with an intercept; print the fitted summary
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())


## Regression models 2 : tv viewers vs all

In [None]:
# Tv viewers vs other features

X = merged_data[['number_of_tweet','number_of_player','number_of_team','sentiment_score','weekday']]

# log(0) is undefined, so zero viewer counts are replaced by a small positive value first.
# replace() returns a new Series instead of writing into a slice of merged_data.
y = np.log(merged_data['tvviewers'].replace(0, 0.0001))

# OLS with an intercept; print the fitted summary
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())


## Benchmark model (SFS): TV viewers vs all

In [None]:
# Create a linear regression object
lr = LinearRegression()

# Sequential feature selection; the search direction is set by `forward` below
sfs = SFS(lr, 
          k_features='best',  # keep the subset with the best cross-validation score ('parsimonious' would prefer a smaller subset of similar score)
          forward=True,  # Set True for forward selection or False for backward elimination
          floating=False, 
          scoring='r2',
          cv=5)  # cross-validation

# Fit the object to the data (X and y as defined by the preceding regression cell)
sfs = sfs.fit(X, y)

# Print the selected features
print('Selected features:', sfs.k_feature_names_)

# Train the model with the selected features
lr.fit(X[list(sfs.k_feature_names_)], y)


In [None]:
X.columns

In [None]:
# Use only the features chosen by the sequential selector. Reading them from sfs keeps
# this cell in step with the selection instead of relying on a hand-copied column list.
X_selected = X[list(sfs.k_feature_names_)]

# Fit the model
lr.fit(X_selected, y)


In [None]:
# Make predictions
y_pred = lr.predict(X_selected)
y_pred

In [None]:
from sklearn.metrics import mean_squared_error

# Calculate MSE between y and the predictions made on X_selected.
# NOTE(review): lr was fitted on these same rows, so this is an in-sample error.
mse = mean_squared_error(y, y_pred)
print('MSE:', mse)


In [None]:
# Print the coefficient of the model
print('Coefficient:', lr.coef_)


In [None]:
import statsmodels.api as sm

def forward_stepwise(X, y, threshold_in):
    """Forward stepwise selection of OLS regressors by p-value.

    Starting from no features, each round fits one OLS model (with intercept) per
    remaining column of `X`, added to the features chosen so far, and records the
    p-value of the added column. The column with the smallest p-value is kept if
    that p-value is below `threshold_in`; otherwise the search stops.

    Parameters
    ----------
    X : DataFrame of candidate regressors.
    y : response passed to `sm.OLS`.
    threshold_in : a feature is added only while its p-value is below this value.

    Returns
    -------
    list of column names in the order they were selected (possibly empty).
    """
    candidate_features = X.columns.tolist()
    best_features = []

    # Stop explicitly once every candidate has been selected.
    while len(best_features) < len(candidate_features):
        # List comprehension instead of a set difference: keeps the column order,
        # so ties are resolved the same way on every run.
        remaining_features = [name for name in candidate_features
                              if name not in best_features]
        # Explicit dtype: an empty Series without one triggers a pandas warning.
        new_pval = pd.Series(index=remaining_features, dtype=float)

        for new_column in remaining_features:
            design = sm.add_constant(X[best_features + [new_column]])
            model = sm.OLS(y, design).fit()
            new_pval[new_column] = model.pvalues[new_column]

        min_p_value = new_pval.min()
        # `not (a < b)` is also true when the minimum is NaN, which ends the search.
        if not min_p_value < threshold_in:
            break
        best_features.append(new_pval.idxmin())

    return best_features

# Use the function to get the best features
best_features = forward_stepwise(X, y, 0.05)

print(best_features)


## TV viewers vs Tweets number：random forest benchmark

In [None]:
from sklearn.model_selection import train_test_split

X= merged_data[['number_of_tweet','number_of_player','number_of_team']]
# Same zero guard as the OLS model of tvviewers: log(0) would put -inf into the target.
y = np.log(merged_data['tvviewers'].replace(0, 0.0001))

# 75 % train / 25 % test, fixed seed
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# random_state fixes the bootstrap samples and splits, so the forest, its predictions
# and the metrics computed from them are the same on every run.
rnd_reg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=0)
rnd_reg.fit(x_train, y_train)
y_pred = rnd_reg.predict(x_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
# Actual vs predicted target values for the test rows, written to y_comparison.csv
y_comparison = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
y_comparison.to_csv('y_comparison.csv', index=False)

In [None]:
# score() of a regressor returns R^2, so this value is 1 - R^2
# (the share of variance not explained), not a classification error rate.
# x_test is passed as a DataFrame, like the training data, so the feature names match.
error_rate = 1-rnd_reg.score(x_test, y_test)
print(error_rate)

In [None]:
#calculate RMSE

from sklearn.metrics import mean_squared_error

# The value is a root mean squared error, so it is named RMSE.
# Taking the square root explicitly avoids relying on the `squared=` flag.
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
print(RMSE)

In [None]:
# Index the importances by the columns the forest was trained on. A hand-written list
# with four labels ('Weekday' is not a model feature) does not fit three importances
# and raises a shape error.
df_rnd_visual = pd.DataFrame({'Feature Importance': rnd_reg.feature_importances_},
                             index=list(x_train.columns))
df_rnd_visual

In [None]:
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# A function to visualise the feature importance or coef
def f_importances(coef, names, title='Continuous random forest model for number of TV viewers vs Tweets'):
    """Draw a horizontal bar chart of feature weights, sorted ascending.

    Parameters
    ----------
    coef : iterable of numeric weights (feature importances or coefficients).
    names : iterable of labels, paired with `coef` by position. zip() stops at the
        shorter input, so surplus labels or weights are silently ignored.
    title : chart title; the default is the title this notebook uses for the
        random forest model.
    """
    pairs = sorted(zip(coef, names))
    if not pairs:
        # Nothing to draw; unpacking an empty zip below would raise ValueError.
        return
    imp, names = zip(*pairs)
    plt.barh(range(len(names)), imp, align='center',color = (0.2, 0.4, 0.6, 0.6))
    plt.yticks(range(len(names)), names)
    plt.xlabel('Coefficient rating',fontsize=12)
    plt.ylabel('Features',fontsize=12)
    plt.title(title,fontsize=14)
    plt.show()

# Label the bars with the columns the forest is trained on
# (a hand-written list can drift from the model's features).
features_names = list(x_train.columns)
# random_state makes the refit, and therefore the plotted importances, reproducible.
rnd_reg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=0)
rnd_reg.fit(x_train, y_train)
y_pred = rnd_reg.predict(x_test)

rnd_importance = rnd_reg.feature_importances_
f_importances(rnd_importance, features_names)


## Number of tweets vs other features: SVM, continuous DV

In [None]:
x = merged_data[['weekday','number_of_player','number_of_team','sentiment_score']]
# Same zero guard as the OLS model of number_of_tweet: log(0) would put -inf into the target.
y = np.log(merged_data['number_of_tweet'].replace(0, 0.0001))

In [None]:
# spilt the training and testing set by 80% and 20% separately
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

In [None]:
# Scale the features (no centring), then fit an epsilon-SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# The pipeline gets its own name: assigning it to `SVR` would replace the imported
# class with an instance, and any later SVR(...) call would stop working.
svr_pipeline = Pipeline([
         ("scaler", StandardScaler(with_mean=False)), 
         ('svr', SVR(epsilon=0.2)), 
     ])
svr_pipeline.fit(x_train, y_train)

In [None]:
# Show the feature importance
from sklearn import svm

svr = svm.SVR(kernel='linear',max_iter=10000000)
svr.fit(x_train,y_train)
svr.coef_

In [None]:
# One row per feature holding the coefficient of the linear SVR
coefficient_labels = ['weekday','number_of_player','number_of_team','sentiment_score']
SVR_clf = pd.DataFrame(svr.coef_[0],
                       index=coefficient_labels,
                       columns=['SVR coefficient rating'])

SVR_clf

In [None]:
y_pred_reg=svr.predict(x_test)
R2=svr.score(x_test,y_test)

In [None]:
from matplotlib import pyplot as plt
from sklearn.svm import SVR

# A function to visualise the feature importance or coef
def f_importances(coef, names, title='SVM Continuous DV for number of tweet vs others'):
    """Draw a horizontal bar chart of feature weights, sorted ascending.

    Parameters
    ----------
    coef : iterable of numeric weights (coefficients or feature importances).
    names : iterable of labels, paired with `coef` by position. zip() stops at the
        shorter input, so surplus labels or weights are silently ignored.
    title : chart title; the default is the title this notebook uses for the
        SVR model.
    """
    pairs = sorted(zip(coef, names))
    if not pairs:
        # Nothing to draw; unpacking an empty zip below would raise ValueError.
        return
    imp, names = zip(*pairs)
    plt.barh(range(len(names)), imp, align='center',color = (0.2, 0.4, 0.6, 0.6))
    plt.yticks(range(len(names)), names)
    plt.xlabel('Coefficient rating',fontsize=12)
    plt.ylabel('Feature names',fontsize=12)
    plt.title(title,fontsize=14)
    plt.show()

# Fit a linear-kernel SVR on the training split and plot its coefficients per feature
features_names = ['weekday','number_of_player','number_of_team','sentiment_score']
svr_model = SVR(kernel='linear',max_iter=10000000).fit(x_train,y_train)
f_importances(svr_model.coef_[0], features_names)

In [None]:
# error rate
error_rate =1- svr_model.score(x_test,y_test)
print(error_rate)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

columns=['MSE','MAE','RMSE','R-squared'] 
rows=['SVR for tweet vs others']

# Metrics of y_pred_reg, the test-set predictions of the fitted linear SVR `svr`.
# `svr` is deliberately not reassigned here: replacing it with a fresh SVR() leaves
# an unfitted model behind for every cell that uses svr afterwards.
svr_mse = mean_squared_error(y_test, y_pred_reg)
results = pd.DataFrame(
    [[svr_mse,
      mean_absolute_error(y_test, y_pred_reg),
      np.sqrt(svr_mse),                      # RMSE is the square root of the MSE
      r2_score(y_test, y_pred_reg)]],
    columns=columns, index=rows)

results

In [None]:
# Scatter plot of actual against predicted values
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_reg, color=(0.2, 0.4, 0.6, 0.6))
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs. Predicted Values for tweet vs others')

# Trend line: first-degree polynomial fitted to (actual, predicted)
coefficients = np.polyfit(y_test, y_pred_reg, 1)
trendline = np.poly1d(coefficients)
ax.plot(y_test, trendline(y_test), color='red')

plt.show()


## TV viewers, tweets, players and teams by event number

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Specify the events to display
events = [1, 2, 7, 10]

# Filter the data for the specified events
filtered_data = merged_data[merged_data['event_number'].isin(events)]

# Specify the variables to display
variables = ['tvviewers', 'number_of_tweet', 'number_of_player', 'number_of_team']

# Loop through each variable and plot a bar chart
for variable in variables:
    # One figure per variable
    fig, ax = plt.subplots(figsize=(10, 6))

    # Sum the variable per event, then reindex to `events`: bar k then always belongs
    # to events[k], and an event with no rows in merged_data gets a zero bar instead
    # of a length mismatch between positions and values.
    event_data = (filtered_data.groupby('event_number')[variable]
                  .sum()
                  .reindex(events, fill_value=0))

    # Plot the bar chart (bar width 0.8 of the slot)
    bars = ax.bar(range(len(events)), event_data.values, width=0.8, color=(0.2, 0.4, 0.6, 0.6))

    # Set the axis labels and title
    ax.set_xlabel('Event Number')
    ax.set_ylabel('Quantity')
    ax.set_title(f'{variable.capitalize()} by Event Number')

    # Set the x-axis ticks and tick labels to display only the specified events
    ax.set_xticks(range(len(events)))
    ax.set_xticklabels(events)

    # Add text annotations on top of each bar (value truncated to an integer)
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)
