In [2]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK packages used by the tokenization and stopword-filtering
# steps further down.
for _resource in ("punkt", "stopwords"):
    nltk.download(_resource)

# =========================
# Load Dataset
# =========================
# The CSV is read from the current working directory.
df = pd.read_csv(filepath_or_buffer="TeslaStock_Dataset_Cleaned.csv")

# =========================
# Step 1: Create Synthetic News Text
# =========================
def generate_news(row):
    """Build a one-line synthetic headline from a row's price movement.

    Args:
        row: Mapping-like record (for example a DataFrame row) that
            provides "Close" and "Open" values.

    Returns:
        The "rises" headline when Close is strictly above Open, the "falls"
        headline when it is strictly below, and the "remains stable"
        headline in every other case.
    """
    closing, opening = row["Close"], row["Open"]

    if closing > opening:
        return "Tesla stock rises due to positive market sentiment"
    if closing < opening:
        return "Tesla stock falls amid investor concerns"
    # Neither comparison held, so report no movement.
    return "Tesla stock remains stable with no major movements"

# Label every row with its synthetic headline (generate_news runs once per row).
df["News_Text"] = df.apply(func=generate_news, axis="columns")

# Show the first rows of the price columns next to the generated text.
news_preview = df.loc[:, ["Date", "Open", "Close", "News_Text"]].head()
print("Sample Synthetic News:")
print(news_preview, "\n")

# =========================
# Step 2: Clean Text
# =========================
_ENGLISH_STOPWORDS = None  # filled in lazily by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalize a piece of text for vectorization.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the remainder is tokenized with NLTK's
    ``word_tokenize``, and English stopwords are dropped.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Look the stopwords up once per call (and load them once per session)
    # instead of re-reading the corpus list for every token.
    stop_set = _english_stopwords()
    tokens = [w for w in word_tokenize(text) if w not in stop_set]
    return " ".join(tokens)

# Run the cleaner over every generated headline.
df["Cleaned_Text"] = df["News_Text"].map(clean_text)

# Show raw and cleaned text side by side for the first rows.
cleaned_preview = df.loc[:, ["News_Text", "Cleaned_Text"]].head()
print("Cleaned Text Sample:")
print(cleaned_preview, "\n")

# =========================
# Step 3: TF-IDF
# =========================
# Cap the vocabulary at 10 terms (max_features) and fit on the cleaned text.
vectorizer = TfidfVectorizer(max_features=10)
X_tfidf = vectorizer.fit_transform(df["Cleaned_Text"])

# Reuse df's index so the feature rows stay aligned with their source rows
# when the two frames are later combined column-wise; a fresh 0..n-1 index
# would misalign them if df ever carries a non-default index.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)

print("TF-IDF Features:")
print(tfidf_df.head())

# =========================
# Step 4: Save Result
# =========================
output_file = "Tesla_NLP_Output.csv"

# Append the TF-IDF columns to the original columns and write the combined
# frame without the index column.
df_final = pd.concat((df, tfidf_df), axis="columns")
df_final.to_csv(path_or_buf=output_file, index=False)

print(f"\n✅ NLP preprocessing completed and saved to {output_file}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khizra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khizra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Sample Synthetic News:
         Date       Open      Close  \
0  2015-01-02  14.858000  14.620667   
1  2015-01-05  14.303333  14.006000   
2  2015-01-06  14.004000  14.085333   
3  2015-01-07  14.223333  14.063333   
4  2015-01-08  14.187333  14.041333   

                                           News_Text  
0           Tesla stock falls amid investor concerns  
1           Tesla stock falls amid investor concerns  
2  Tesla stock rises due to positive market senti...  
3           Tesla stock falls amid investor concerns  
4           Tesla stock falls amid investor concerns   

Cleaned Text Sample:
                                           News_Text  \
0           Tesla stock falls amid investor concerns   
1           Tesla stock falls amid investor concerns   
2  Tesla stock rises due to positive market senti...   
3           Tesla stock falls amid investor concerns   
4           Tesla stock falls amid investor concerns   

                                      Cleaned_Text  