In [None]:
# --- Step 1: Install and Import Libraries ---
print("Installing and importing libraries...")
!pip install pandas nltk matplotlib seaborn
import pandas as pd
import nltk
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import bigrams
import os

# Download necessary NLTK resources.
# 'punkt' covers older NLTK releases; recent releases load the tokenizer
# tables from 'punkt_tab' instead, and word_tokenize() raises LookupError
# when that package is missing. Requesting both keeps either version working
# (nltk.download reports an unknown package and returns False, it does not raise).
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
print("--- Libraries ready! ---")


# --- Step 2: Load and Prepare Data ---
print("\n--- Step 2: Loading Data ---")
from google.colab import drive
drive.mount('/content/drive')

# !! IMPORTANT: Update the path to your file !!
file_path = '/content/drive/MyDrive/all_musk_posts.csv'

# Only the read itself is guarded, so the "check your path" hint is printed
# solely for problems that actually come from reading the file.
try:
    df = pd.read_csv(file_path, low_memory=False)
except Exception as e:
    print(f"ERROR: Could not load file. Check your path. Error: {e}")
    # Every later step needs `df`; stop here with the real cause instead of
    # continuing and failing further down with an unrelated NameError.
    raise

df.columns = df.columns.str.strip() # Clean column names
if 'fullText' not in df.columns:
    # A wrong/renamed column is a schema problem, not a path problem.
    raise KeyError(
        f"Column 'fullText' not found in {file_path}. "
        f"Available columns: {list(df.columns)}"
    )
# Ensure the text column has no missing values (NaN)
df['fullText'] = df['fullText'].fillna('')
print(f"Loaded {len(df)} tweets.")

# --- Step 3: Define Our Targets (from the "Rosetta Stone") ---
# The exact terms whose frequencies are reported below in support of the
# README analysis.

# Single-word affirmations, counted on lightly cleaned text (Step 4).
TARGET_AFFIRMATIONS = [
    'yeah',
    'true',
    'yup',
]

# Single-word themes, counted on fully cleaned text (Step 5).
TARGET_THEMES = [
    'tesla',
    'media',
]

# Two-word phrases, stored as (first, second) token tuples (Step 6).
TARGET_BIGRAMS = [
    ('coming', 'soon'),
    ('legacy', 'media'),
]


# --- Step 4: Analysis 1 - Affirmations ("Yeah", "True", "Yup") ---
# Affirmations are ordinary short words, so cleaning stays minimal here and
# stopwords are deliberately kept.
print("\n--- Step 4: Analyzing Affirmations (Yeah, True, Yup) ---")

def tokenize_raw(text):
    """Return the word tokens of *text* after light cleaning.

    The input is converted to ``str``, lowercased, stripped of every
    character in ``string.punctuation`` and then split with NLTK's
    ``word_tokenize``. No stopword or length filtering is applied.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = str(text).lower()
    return word_tokenize(lowered.translate(strip_punctuation))

# Tally every token of every tweet, using the lightly cleaned text
raw_word_counts = Counter()
for tweet in df['fullText']:
    raw_word_counts.update(tokenize_raw(tweet))

# Look up each target affirmation (a Counter yields 0 for unseen words)
affirmation_results = {term: raw_word_counts[term] for term in TARGET_AFFIRMATIONS}
for term in TARGET_AFFIRMATIONS:
    print(f"Found '{term}': {affirmation_results[term]} times.")


# --- Step 5: Analysis 2 - Thematic Words ("Tesla", "media") ---
# Thematic counts use the fully cleaned text: links, mentions, hashtags,
# punctuation, digits, stopwords and very short tokens are all dropped.
print("\n--- Step 5: Analyzing Thematic Words (Tesla, media) ---")

# NLTK's English stopwords plus a few corpus-specific noise words
custom_stopwords = {'rt', 'via', 'amp', 'elon', 'musk'}
stop_words = set(stopwords.words('english')) | custom_stopwords

def clean_text(text):
    """Return the meaningful word tokens of *text*.

    Steps, in order: convert to ``str`` and lowercase; delete URLs,
    ``@mentions`` and ``#hashtags``; delete ``string.punctuation``
    characters; delete digit runs; tokenize with ``word_tokenize``; keep
    only tokens longer than two characters that are not in ``stop_words``.
    """
    cleaned = str(text).lower()
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    # Mentions first, then hashtags -- same order as the removal of links above
    for pattern in (r'\@\w+', r'\#\w+'):
        cleaned = re.sub(pattern, '', cleaned)
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    cleaned = re.sub(r'\d+', '', cleaned)

    kept = []
    for token in word_tokenize(cleaned):
        if len(token) > 2 and token not in stop_words:
            kept.append(token)
    return kept

# Tally every token of every tweet, using the fully cleaned text
themed_word_counts = Counter()
for tweet in df['fullText']:
    themed_word_counts.update(clean_text(tweet))

# Look up each target theme (a Counter yields 0 for unseen words)
theme_results = {term: themed_word_counts[term] for term in TARGET_THEMES}
for term in TARGET_THEMES:
    print(f"Found '{term}': {theme_results[term]} times.")


# --- Step 6: Analysis 3 - Bigrams ("coming soon", "legacy media") ---
# We look for two-word phrases in the cleaned text
print("\n--- Step 6: Analyzing Bigrams (coming soon, legacy media) ---")

# Bigrams are counted tweet by tweet. Building them from one concatenated
# token list would pair the last word of a tweet with the first word of the
# next one and count phrases that never appeared in any single tweet.
all_cleaned_tokens = []  # flat token list, kept available as before
bigram_counts = Counter()
for tweet in df['fullText']:
    tweet_tokens = clean_text(tweet)
    all_cleaned_tokens.extend(tweet_tokens)
    # Tweets with fewer than two tokens simply contribute no bigrams
    bigram_counts.update(bigrams(tweet_tokens))

# Pull the results for our targets
bigram_results = {}
for bigram in TARGET_BIGRAMS:
    count = bigram_counts[bigram]
    bigram_results[bigram] = count
    print(f"Found '{' '.join(bigram)}': {count} times.")

# --- Step 7: Final Visualization (Data for the "Rosetta Stone") ---
# We create one chart that summarizes ALL our key findings
print("\n--- Step 7: Creating final summary chart ---")

# Prepare data for the chart.
# The rows are derived from the result dictionaries instead of a hard-coded
# key list, so the chart always matches the TARGET_* definitions: adding or
# removing a target no longer raises KeyError or silently drops a bar.
# Bigram tuples are joined with a space to form their label.
report_data = {
    **affirmation_results,
    **theme_results,
    **{' '.join(pair): freq for pair, freq in bigram_results.items()},
}

# One row per keyword/phrase, most frequent first
df_report = pd.DataFrame(
    list(report_data.items()), columns=['Keyword', 'Frequency']
).sort_values(by='Frequency', ascending=False)

# Make sure the output folder exists before saving into it
os.makedirs('images', exist_ok=True)

# Draw the horizontal bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Frequency', y='Keyword', data=df_report, palette='mako', ax=ax)
ax.set_title('Frequency of Key "Rosetta Stone" Terms in Musk\'s Tweets', fontsize=16)
ax.set_xlabel('Total Frequency Count', fontsize=12)
ax.set_ylabel('Keyword / Phrase', fontsize=12)

# Write the image to disk first, then display it
fig.savefig('images/rosetta_stone_keywords.png')
print("Saved final chart to 'images/rosetta_stone_keywords.png'")
plt.show()

# This line has been fixed:
print('\n--- "Rosetta Stone" Analysis Complete! ---')


