# Loading and Fetching Data from Reddit with PRAW

In [None]:
import os

# Reddit API credentials are read from the environment so that secrets are
# never stored in the notebook. Set REDDIT_CLIENT_SECRET and REDDIT_CLIENT_ID
# (and optionally REDDIT_USER_AGENT) before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here should be treated
# as leaked and revoked.
SECRET_KEY = os.environ.get('REDDIT_CLIENT_SECRET', '')
CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID', '')
USER_NAME = os.environ.get('REDDIT_USER_AGENT', 'Weary-Tooth7440')

In [None]:
import praw
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', context='talk', palette='Dark2')

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

%pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
#Visualizing most frequent words
from nltk.probability import FreqDist
import re

In [None]:
# Create the PRAW client that the scraping functions below call into
reddit = praw.Reddit(
    user_agent=USER_NAME,
    client_id=CLIENT_ID,
    client_secret=SECRET_KEY,
)

In [None]:
def get_subreddit_data(subreddit_name):
    """Return the subreddit object for `subreddit_name`.

    Looks the name up through the module-level `reddit` client.
    """
    return reddit.subreddit(subreddit_name)

# Subreddit handles for r/stocks and r/wallstreetbets (nothing is displayed here)
stocks_subreddit, wall_street_bets_subreddit = map(
    get_subreddit_data, ('stocks', 'wallstreetbets')
)

In [None]:
def _comment_record(post, comment):
    """Build the flat dict stored for one comment of `post`."""
    return {
        'id': post.id + '_' + comment.id,
        # comment.author is falsy for some comments; label those 'Unknown'
        'Author': comment.author.name if comment.author else 'Unknown',
        'Timestamp': pd.to_datetime(comment.created_utc, unit='s'),
        'Text': comment.body,
        'Score': comment.score,
        'Post_url': post.url,
    }


def get_subreddit_data(subreddit_names):
    """Scrape the comments of recent 'daily discussion' posts.

    For each subreddit, searches for 'daily discussion' (newest first,
    past week) and collects every comment of each post that has at least
    one comment.

    Args:
        subreddit_names: Iterable of subreddit names. A single name passed
            as a plain string is treated as a one-element list instead of
            being iterated character by character.

    Returns:
        A list of dicts, one per comment, with the keys 'id', 'Author',
        'Timestamp', 'Text', 'Score' and 'Post_url'.
    """
    if isinstance(subreddit_names, str):
        subreddit_names = [subreddit_names]

    data = []
    for subreddit_name in subreddit_names:
        subreddit = reddit.subreddit(subreddit_name)
        posts = subreddit.search('daily discussion', sort='new', time_filter='week')
        for post in posts:
            if post.num_comments <= 0:
                continue
            # Resolve at most 5 "more comments" placeholders per post
            # before flattening the comment tree.
            post.comments.replace_more(limit=5)
            data.extend(
                _comment_record(post, comment)
                for comment in post.comments.list()
            )
    return data

# Scrape comments from both subreddits in a single call
subreddit_names = ['stocks', 'wallstreetbets']
all_data = get_subreddit_data(subreddit_names)

In [None]:
# Build the comments DataFrame. The columns are listed explicitly so the frame
# still has them when the scrape returned no comments; without that the
# cleaning steps would fail with KeyError on 'Text'.
COMMENT_COLUMNS = ['id', 'Author', 'Timestamp', 'Text', 'Score', 'Post_url']
df = pd.DataFrame(all_data, columns=COMMENT_COLUMNS)

# Cleaning The Data

In [None]:
def DropDeletedComment(data):
    """Drop rows whose 'Text' contains '[removed]' or '[deleted]'.

    Args:
        data: DataFrame with a 'Text' column.

    Returns:
        A new DataFrame without the matching rows, re-indexed from 0.
        Rows whose 'Text' is missing are kept (``na=False``).
    """
    # Raw string: '\[' in a normal literal is an invalid escape sequence.
    placeholder_pattern = r'\[removed\]|\[deleted\]'
    is_placeholder = data['Text'].str.contains(placeholder_pattern, na=False, regex=True)
    return data[~is_placeholder].reset_index(drop=True)

In [None]:
def ReplaceParagraphBrake(data):
    """Replace newline characters in 'Text' with single spaces.

    Args:
        data: DataFrame with a string 'Text' column.

    Returns:
        A new DataFrame, re-indexed from 0. ``data`` itself is not modified.
    """
    # assign() builds a new frame, so the caller's frame (possibly a filtered
    # slice of another frame) is never written to. regex=False makes the
    # literal newline match explicit.
    flattened = data['Text'].str.replace('\n', ' ', regex=False)
    return data.assign(Text=flattened).reset_index(drop=True)

In [None]:
def DropSpamComment(data):
    """Drop rows whose 'Text' contains a spam keyword as a whole word.

    Args:
        data: DataFrame with a 'Text' column.

    Returns:
        A new DataFrame without the matching rows, re-indexed from 0.
        Rows whose 'Text' is missing are kept (``na=False``). Matching is
        case-sensitive.
    """
    # Raw string is required: in a normal literal '\b' is a backspace
    # character, so the word-boundary pattern could never match. The group is
    # non-capturing because str.contains warns about capture groups.
    spam_pattern = r'\b(?:free|sale|discount|limited time|offer|buy now|click here)\b'
    is_spam = data['Text'].str.contains(spam_pattern, na=False, regex=True)
    return data[~is_spam].reset_index(drop=True)

In [None]:
def RemoveURL(data):
    """Strip URLs (any run starting with 'http' up to whitespace) from 'Text'.

    Args:
        data: DataFrame with a string 'Text' column.

    Returns:
        A new DataFrame, re-indexed from 0. ``data`` itself is not modified.
    """
    # regex=True must be explicit: str.replace treats the pattern as a
    # literal string by default on pandas >= 2.0, which left URLs in place.
    without_urls = data['Text'].str.replace(r'http\S+', '', regex=True)
    return data.assign(Text=without_urls).reset_index(drop=True)

In [None]:
def RemoveUser(data):
    """Strip '@name' style user mentions from 'Text'.

    Args:
        data: DataFrame with a string 'Text' column.

    Returns:
        A new DataFrame, re-indexed from 0. ``data`` itself is not modified.
    """
    # regex=True must be explicit: str.replace treats the pattern as a
    # literal string by default on pandas >= 2.0, which left mentions in place.
    without_mentions = data['Text'].str.replace(r'@\w+', '', regex=True)
    return data.assign(Text=without_mentions).reset_index(drop=True)

In [None]:
# Run the cleaning steps in order; each one takes and returns a DataFrame
df = (
    df.pipe(DropDeletedComment)
    .pipe(ReplaceParagraphBrake)
    .pipe(DropSpamComment)
    .pipe(RemoveURL)
    .pipe(RemoveUser)
)

In [None]:
# (rows, columns) of the frame after the cleaning steps
df.shape

# Preprocessing Comments

In [None]:
# Fetch the NLTK resources this notebook relies on. 'punkt_tab' is requested
# alongside 'punkt' because recent NLTK releases load the word_tokenize
# tokenizer tables from that package; on releases that do not know it the
# download call just reports the resource as unavailable.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)

# Shared helpers used by text_preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [None]:
def text_preprocessing(txt):
    """Normalise one comment into a space-joined string of tokens.

    Steps: collapse every run of non-word characters into a single space,
    lowercase, tokenize, drop tokens found in `stop_words`, then stem and
    lemmatize each remaining token.
    """
    cleaned = re.sub(r'\W+', ' ', txt).lower()
    kept_tokens = (tok for tok in word_tokenize(cleaned) if tok not in stop_words)

    # NOTE(review): the lemmatizer is fed already-stemmed tokens; confirm
    # that running both passes is intended.
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(tok)) for tok in kept_tokens)

# Keep the cleaned comment as 'original_text' (it is what the sentiment model
# reads later), then overwrite 'Text' with its tokenized, normalised form
df['original_text'] = df['Text']
df['Text'] = df['Text'].map(text_preprocessing)

# Visualization

In [None]:


# Number of most frequent words to chart (also shown in the plot title, so the
# two can no longer disagree)
TOP_N_WORDS = 30

# Flatten every preprocessed comment into one token list and count the tokens
all_words = ' '.join(df['Text']).split()
word_freq = FreqDist(all_words)

# Keep the TOP_N_WORDS most frequent words. most_common() also copes with an
# empty corpus, where nlargest() on an empty object-dtype column would raise.
words_df = pd.DataFrame(word_freq.most_common(TOP_N_WORDS), columns=['word', 'count'])

# Ascending order so the longest bar is drawn last in the horizontal chart
words_df = words_df.sort_values('count')

# Plot the most frequent words
plt.figure(figsize=(20, 10))
plt.title(f"Top {TOP_N_WORDS} Frequent Words")
ax = plt.barh(words_df['word'], width=words_df['count'])
plt.show()

# Predicting Sentiment with a Pretrained Model

In [None]:
# Name of the pretrained checkpoint; the tokenizer, the config (whose
# id2label mapping get_sentiment reads) and the classification model are all
# loaded from it. A plain string is used since there is nothing to interpolate.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def get_sentiment(text):
    """Classify `text` with the module-level tokenizer, model and config.

    Args:
        text: The string to classify. Input longer than 512 tokens is
            truncated by the tokenizer.

    Returns:
        A ``(label, score)`` tuple: the ``config.id2label`` entry with the
        highest softmax probability, and that probability.
    """
    import torch  # local import: the file has no top-level torch import

    # A single sequence needs no padding. Padding to max_length only made the
    # model process 512 positions for every comment, however short.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] holds the logits; [0] selects the only sequence in the batch
    scores = softmax(output[0][0].detach().numpy())

    best = int(np.argmax(scores))
    return config.id2label[best], scores[best]

In [None]:
# Newest comments first
df = df.sort_values(by='Timestamp', ascending=False)

# Only the 10 most recent comments are scored
latest_df = df.head(10)

results = []
for text, timestamp in zip(latest_df['original_text'], latest_df['Timestamp']):
    predicted_label, predicted_score = get_sentiment(text)
    results.append([text, timestamp, predicted_label, predicted_score])

    # Per-comment trace of the prediction
    print(f"Text: {text}")
    print(f"Timestamp: {timestamp}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Predicted Score: {predicted_score}\n")

# One row per scored comment
results_df = pd.DataFrame(
    results,
    columns=['Original Text', 'Timestamp', 'Sentiment', 'Score'],
)

results_df

In [None]:
# Top Frequent Mentioned Stocks

# Tickers to track
stocks = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSLA', 'META', 'NVDA', 'CRWD']

# For each ticker, count the comments that mention it as a whole word
# (case-insensitive). A plain substring test would also count words such as
# "metal" or "metadata" as a mention of META.
stock_counts = {
    stock: int(
        results_df['Original Text']
        .str.contains(rf'\b{re.escape(stock)}\b', case=False, regex=True, na=False)
        .sum()
    )
    for stock in stocks
}

# Ticker -> number of comments mentioning it
stock_counts_series = pd.Series(stock_counts)

In [None]:
# Most-mentioned tickers first
stock_counts_series = stock_counts_series.sort_values(ascending=False)

# Bar chart of the N most-mentioned tickers
N = 10  # how many tickers to show
plt.figure(figsize=(10, 6))
stock_counts_series.iloc[:N].plot.bar()
plt.title(f'Top {N} Frequently Mentioned Stocks')
plt.ylabel('Frequency')
plt.xlabel('Stock')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
Stock_mention = ['CRWD', 'Crowdstrike', 'crowdstrike']

# One case-insensitive boolean mask per search term
masks = []
for word in Stock_mention:
    masks.append(results_df['Original Text'].str.contains(word, case=False))

# A row is selected when any of the masks matches it
combined_mask = np.logical_or.reduce(masks)

# Rows of results_df that mention the stock
filtered_df = results_df[combined_mask]

# (rows, columns) of the filtered frame
filtered_df.shape

In [None]:
# Sum the prediction scores per sentiment label over the filtered comments
scores = filtered_df['Score']
labels = filtered_df['Sentiment']
positive_score_sum = scores[labels == 'positive'].sum()
negative_score_sum = scores[labels == 'negative'].sum()
neutral_score_sum = scores[labels == 'neutral'].sum()

data = [positive_score_sum, negative_score_sum, neutral_score_sum]
Sentiments = ['Positive', 'Negative', 'Neutral']

if sum(data) > 0:
    # Pie chart of each sentiment's share of the summed scores
    fig = plt.figure(figsize=(10, 5))
    plt.title('Sentiment Analysis on ' + (Stock_mention[0]))
    plt.pie(data, labels=Sentiments, autopct='%.2f')

    # show plot
    plt.show()
else:
    # With no matching comments every sum is zero and the pie shares would be
    # a division by a zero total, so report that instead of plotting.
    print(f'No scored comments mention {Stock_mention[0]}; skipping the pie chart.')