# Author: Jacob Haas
Version: Comp3800 Fall 24


In [None]:
import pandas as pd
import os
import glob
import re
from collections import Counter
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

stop_words = set(stopwords.words("english"))

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Concatenate keywords files
#
# Merge every keywords/*.txt file into one text file, writing each source
# file's content followed by a newline. The merge is skipped when the output
# file already exists; delete it to force a rebuild.

output_file = "comp3800f24_keywords.txt"
if not os.path.exists(output_file):
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the concatenated file identical from run to run.
    keywords_files = sorted(glob.glob("keywords/*.txt"))
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(output_file, "w", encoding="utf-8") as outfile:
        for fname in keywords_files:
            with open(fname, encoding="utf-8") as infile:
                outfile.write(infile.read() + "\n")
    print(f"Keywords files concatenated into {output_file}.")
else:
    print(f"{output_file} already exists. Skipping file creation.")

In [None]:
# Load and clean the dataset
#
# Read the raw export, keep only the rows whose "type" is "tweet", and narrow
# the frame to the columns listed in common_columns.

tweets_df = pd.read_csv("comp3800f24_tweets.csv", low_memory=False)
common_columns = [
    "id", "url", "twitterUrl", "text", "source", "retweetCount",
    "replyCount", "likeCount", "quoteCount", "viewCount",
    "createdAt", "lang", "bookmarkCount", "isReply",
    "inReplyToId", "conversationId", "inReplyToUsername",
    "isPinned", "isRetweet", "isConversationControlled"
]
# Select rows and columns in one .loc call and take an explicit copy: new
# columns are assigned to tweets_df further down, and assigning into a frame
# obtained by chained indexing is ambiguous (SettingWithCopyWarning).
tweets_df = tweets_df.loc[tweets_df["type"] == "tweet", common_columns].copy()
print("Shape after loading the dataset:", tweets_df.shape)

In [None]:
# Tokenize and preprocess text

# Compiled once at import time instead of on every call.
_URL_OR_MENTION_RE = re.compile(r'http\S+|www\S+|@\S+')
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9\s]')


def clean_and_tokenize(text, stopword_set=None):
    """Return the lowercase word tokens of *text* with noise removed.

    Processing order: strip URLs and @mentions, strip every character that
    is not an ASCII letter, digit or whitespace, lowercase, split on
    whitespace, then drop stop words.

    Args:
        text: The raw tweet text. Any non-string value (for example the
            float NaN pandas uses for a missing cell) yields an empty list
            instead of raising ``TypeError``.
        stopword_set: Optional collection of lowercase words to discard.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        A list of token strings, in their original order.
    """
    if not isinstance(text, str):
        return []

    excluded = stop_words if stopword_set is None else stopword_set

    text = _URL_OR_MENTION_RE.sub('', text)   # Remove URLs and mentions
    text = _NON_ALNUM_RE.sub('', text)        # Remove special characters
    return [word for word in text.lower().split() if word not in excluded]

# Tokenize every tweet and keep the token list next to the raw text
tweets_df["processed_text"] = tweets_df["text"].apply(clean_and_tokenize)

# Flatten the per-tweet token lists into one list of words
all_words = []
for token_list in tweets_df["processed_text"]:
    all_words.extend(token_list)

# Tally the words and report the 20 most common as (word, count) pairs
word_frequencies = Counter(all_words)
word_counts = word_frequencies.most_common(20)
print(word_counts)


In [None]:
# Bar plot of the most frequent words
#
# word_counts is a list of (word, count) pairs. Unpacking an empty list
# would call plt.bar() with no arguments and raise TypeError, so the plot is
# skipped when there is nothing to show.
if word_counts:
    words, frequencies = zip(*word_counts)
    plt.figure(figsize=(10, 6))
    plt.bar(words, frequencies)
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title("Most Frequent Words")
    # Anchor the rotated labels at their right end so each lines up with
    # its bar.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()
else:
    print("No words to plot.")

In [None]:
# Sentiment analysis
def _polarity(text):
    """Return the TextBlob polarity score of *text*, or NaN for non-strings.

    pandas represents a missing text cell as float NaN, which TextBlob
    rejects with a TypeError. Returning NaN keeps such rows out of the
    positive/negative comparisons and out of averaged sentiment.
    """
    if not isinstance(text, str):
        return float("nan")
    return TextBlob(text).sentiment.polarity


tweets_df["sentiment"] = tweets_df["text"].apply(_polarity)
# Notebook preview of the first rows (no effect when run as a script).
tweets_df[["text", "sentiment"]].head()

In [None]:
# Wordclouds for positive and negative tweets
def _show_wordcloud(text, title):
    """Draw a word cloud of *text* under *title* and return the WordCloud.

    Returns None, without plotting, when WordCloud cannot find any word in
    *text* (it raises ValueError in that case), so an empty sentiment group
    does not abort the remaining analysis.
    """
    try:
        cloud = WordCloud(width=800, height=400, background_color="white").generate(text)
    except ValueError:
        print(f"No words available for: {title}")
        return None

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation="bilinear")
    plt.title(title)
    plt.axis("off")
    plt.show()
    return cloud


# Join the raw text of all tweets with strictly positive / strictly negative
# sentiment; tweets with a sentiment of exactly 0 appear in neither.
positive_text = " ".join(tweets_df[tweets_df["sentiment"] > 0]["text"])
negative_text = " ".join(tweets_df[tweets_df["sentiment"] < 0]["text"])

# Positive Wordcloud
wordcloud_positive = _show_wordcloud(positive_text, "Positive Tweets Wordcloud")

# Negative Wordcloud
wordcloud_negative = _show_wordcloud(negative_text, "Negative Tweets Wordcloud")

In [None]:
# Make 'retweetCount' numeric: values that cannot be parsed become NaN
numeric_retweets = pd.to_numeric(tweets_df["retweetCount"], errors="coerce")
tweets_df["retweetCount"] = numeric_retweets

# Keep only the rows that have a retweet count, then store it as an integer
has_retweet_count = tweets_df["retweetCount"].notna()
tweets_df = tweets_df[has_retweet_count]
tweets_df["retweetCount"] = tweets_df["retweetCount"].astype(int)

# Most retweeted tweets sentiment
# Work on a copy so that adding the helper column below never touches
# tweets_df.
most_retweeted = tweets_df.nlargest(10, "retweetCount").copy()


def _shorten(text, limit=50):
    """Return *text* cut to *limit* characters plus "..." when it is longer.

    Non-string values (such as NaN for a missing tweet) become "".
    """
    if not isinstance(text, str):
        return ""
    return text[:limit] + "..." if len(text) > limit else text


# Shorten tweet text for better visualization
most_retweeted["short_text"] = most_retweeted["text"].apply(_shorten)

# Plot
# Bars are placed at integer positions and labelled afterwards. Passing the
# strings directly would put tweets with the same shortened text on the same
# categorical position, drawing them on top of each other.
bar_positions = list(range(len(most_retweeted)))
# Escape dollar signs so matplotlib does not try to parse text between two
# "$" characters as a math expression.
tick_labels = [label.replace("$", r"\$") for label in most_retweeted["short_text"]]

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, most_retweeted["sentiment"], color="skyblue", edgecolor="black")
plt.xlabel("Sentiment", fontsize=12)
plt.ylabel("Tweets", fontsize=12)
plt.title("Sentiment of Most Retweeted Tweets", fontsize=16)
plt.grid(axis="x", linestyle="--", alpha=0.7)

# Adjust ticks and layout
plt.xticks(fontsize=10)
plt.yticks(bar_positions, tick_labels, fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Parse datetime
tweets_df["createdAt"] = pd.to_datetime(tweets_df["createdAt"])

# Filter tweets with keywords
# Word boundaries are required: the bare pattern "X|Grok" combined with
# case=False matches every tweet that merely contains the letter "x"
# ("next", "text", ...), which is nearly the whole dataset.
keyword_pattern = r"\b(?:X|Grok)\b"
keyword_mask = tweets_df["text"].str.contains(keyword_pattern, case=False, na=False)
# Copy the selection so the "date" column is added to an independent frame
# rather than to a slice of tweets_df.
subset = tweets_df[keyword_mask].copy()

# Sentiment over time: mean sentiment of the matching tweets per calendar day
subset["date"] = subset["createdAt"].dt.date
sentiment_over_time = subset.groupby("date")["sentiment"].mean()

# Plot
plt.figure(figsize=(12, 6))
plt.plot(sentiment_over_time.index, sentiment_over_time.values, label="Sentiment", color="blue")
# NOTE(review): the marker was previously at 2022-04-14, which is the date of
# the offer to buy Twitter; the rebrand to "X" took place in late July 2023.
plt.axvline(pd.to_datetime("2023-07-24"), color="red", linestyle="--", label="Twitter became X")
plt.axvline(pd.to_datetime("2024-10-15"), color="green", linestyle="--", label="Grok Content Usage Announcement")
plt.xlabel("Date")
plt.ylabel("Average Sentiment")
plt.title("Sentiment Over Time Regarding 'X' and 'Grok'")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Vectorize text data
# TF-IDF over the raw tweet text, limited to 1000 terms; missing text is
# treated as an empty document. The sparse result is converted to a dense
# array because PCA below is given a dense matrix.
vectorizer = TfidfVectorizer(max_features=1000)
text_features = vectorizer.fit_transform(tweets_df["text"].fillna("")).toarray()

# PCA for dimensionality reduction
# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 50 is clamped for small inputs instead of raising ValueError.
n_components = min(50, *text_features.shape)
# Fixed seed (same value as the train/test split) so solvers that use
# randomness give the same projection on every run.
pca = PCA(n_components=n_components, random_state=3800)
reduced_features = pca.fit_transform(text_features)

In [None]:
# Predict retweet count
# Features are the PCA-reduced TF-IDF vectors; the target is the retweet
# count, with any missing value treated as 0.
X = reduced_features
y = tweets_df["retweetCount"].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3800)

# Train model
# Seed the forest with the same value as the split: without it the bootstrap
# samples differ on every run and the reported error is not reproducible.
model = RandomForestRegressor(random_state=3800)
model.fit(X_train, y_train)

# Evaluate model on the held-out 20%
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")