In [19]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download required datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    match = re.search(r"b'(.*?)'", text)

    if match:
        extracted_text = match.group(1)
        text = extracted_text 
        
    match = re.search(r'b"(.*?)"', text)

    if match:
        extracted_text = match.group(1)
        text = extracted_text 
        
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return cleaned_tokens

df = pd.read_csv("RedditNews.csv")

print(df)
print(df.columns)



             Date                                               News
0      2016-07-01  A 117-year-old woman in Mexico City finally re...
1      2016-07-01   IMF chief backs Athens as permanent Olympic host
2      2016-07-01  The president of France says if Brexit won, so...
3      2016-07-01  British Man Who Must Give Police 24 Hours' Not...
4      2016-07-01  100+ Nobel laureates urge Greenpeace to stop o...
...           ...                                                ...
73603  2008-06-08  b'Man goes berzerk in Akihabara and stabs ever...
73604  2008-06-08  b'Threat of world AIDS pandemic among heterose...
73605  2008-06-08  b'Angst in Ankara: Turkey Steers into a Danger...
73606  2008-06-08  b"UK: Identity cards 'could be used to spy on ...
73607  2008-06-08  b'Marriage, they said, was reduced to the stat...

[73608 rows x 2 columns]
Index(['Date', 'News'], dtype='object')


[nltk_data] Downloading package punkt to /home/sohn31/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sohn31/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sohn31/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sohn31/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [21]:
df['tokens'] = df['News'].apply(clean_text)
print(df)

             Date                                               News  \
0      2016-07-01  A 117-year-old woman in Mexico City finally re...   
1      2016-07-01   IMF chief backs Athens as permanent Olympic host   
2      2016-07-01  The president of France says if Brexit won, so...   
3      2016-07-01  British Man Who Must Give Police 24 Hours' Not...   
4      2016-07-01  100+ Nobel laureates urge Greenpeace to stop o...   
...           ...                                                ...   
73603  2008-06-08  b'Man goes berzerk in Akihabara and stabs ever...   
73604  2008-06-08  b'Threat of world AIDS pandemic among heterose...   
73605  2008-06-08  b'Angst in Ankara: Turkey Steers into a Danger...   
73606  2008-06-08  b"UK: Identity cards 'could be used to spy on ...   
73607  2008-06-08  b'Marriage, they said, was reduced to the stat...   

                                            cleaned_text  \
0      [yearold, woman, mexico, city, finally, receiv...   
1      [imf, ch

In [25]:
df.drop('cleaned_text', axis= 1, inplace=True)

KeyError: "['cleaned_text'] not found in axis"

In [26]:
df.drop('News', axis= 1, inplace=True)

In [27]:
print(df)

             Date                                             tokens
0      2016-07-01  [yearold, woman, mexico, city, finally, receiv...
1      2016-07-01  [imf, chief, back, athens, permanent, olympic,...
2      2016-07-01    [president, france, say, brexit, donald, trump]
3      2016-07-01  [british, man, must, give, police, hour, notic...
4      2016-07-01  [nobel, laureate, urge, greenpeace, stop, oppo...
...           ...                                                ...
73603  2008-06-08  [man, go, berzerk, akihabara, stab, everyone, ...
73604  2008-06-08  [threat, world, aid, pandemic, among, heterose...
73605  2008-06-08  [angst, ankara, turkey, steer, dangerous, iden...
73606  2008-06-08  [uk, identity, card, could, used, spy, people,...
73607  2008-06-08  [marriage, said, reduced, status, commercial, ...

[73608 rows x 2 columns]


In [28]:
# df.to_csv('reddit_news_tokens.csv', index=False) 

In [31]:
dji_stock_price_df = pd.read_csv("DJI_stock_data.csv")
print(dji_stock_price_df)

            Date         Close          High           Low          Open  \
0     2008-06-09  12280.320312  12331.860352  12195.320312  12210.129883   
1     2008-06-10  12289.759766  12369.230469  12206.959961  12277.709961   
2     2008-06-11  12083.769531  12286.669922  12079.129883  12286.339844   
3     2008-06-12  12141.580078  12269.240234  12076.929688  12089.629883   
4     2008-06-13  12307.349609  12310.280273  12144.589844  12144.589844   
...          ...           ...           ...           ...           ...   
2026  2016-06-24  17400.750000  17946.630859  17356.339844  17946.630859   
2027  2016-06-27  17140.240234  17355.210938  17063.080078  17355.210938   
2028  2016-06-28  17409.720703  17409.720703  17190.509766  17190.509766   
2029  2016-06-29  17694.679688  17704.509766  17456.019531  17456.019531   
2030  2016-06-30  17929.990234  17930.609375  17711.800781  17712.759766   

         Volume  
0     266350000  
1     240760000  
2     247120000  
3     260960000

In [39]:
dji_stock_price_df['Close_diff'] = dji_stock_price_df['Close'].shift(-1) - dji_stock_price_df['Close']
dji_stock_price_df['High_diff'] = dji_stock_price_df['High'].shift(-1) - dji_stock_price_df['High']
dji_stock_price_df['Low_diff'] = dji_stock_price_df['Low'].shift(-1) - dji_stock_price_df['Low']
dji_stock_price_df['Open_diff'] = dji_stock_price_df['Open'].shift(-1) - dji_stock_price_df['Open']
dji_stock_price_df['Open_Close_diff'] = dji_stock_price_df['Close'] - dji_stock_price_df['Open']

print(dji_stock_price_df)

            Date         Close          High           Low          Open  \
0     2008-06-09  12280.320312  12331.860352  12195.320312  12210.129883   
1     2008-06-10  12289.759766  12369.230469  12206.959961  12277.709961   
2     2008-06-11  12083.769531  12286.669922  12079.129883  12286.339844   
3     2008-06-12  12141.580078  12269.240234  12076.929688  12089.629883   
4     2008-06-13  12307.349609  12310.280273  12144.589844  12144.589844   
...          ...           ...           ...           ...           ...   
2026  2016-06-24  17400.750000  17946.630859  17356.339844  17946.630859   
2027  2016-06-27  17140.240234  17355.210938  17063.080078  17355.210938   
2028  2016-06-28  17409.720703  17409.720703  17190.509766  17190.509766   
2029  2016-06-29  17694.679688  17704.509766  17456.019531  17456.019531   
2030  2016-06-30  17929.990234  17930.609375  17711.800781  17712.759766   

         Volume  Close_diff   High_diff    Low_diff   Open_diff  \
0     266350000    9

In [40]:
reddit_news_df = pd.read_csv("reddit_news_tokens.csv")

In [41]:
print(reddit_news_df)

             Date                                             tokens
0      2016-07-01  ['yearold', 'woman', 'mexico', 'city', 'finall...
1      2016-07-01  ['imf', 'chief', 'back', 'athens', 'permanent'...
2      2016-07-01  ['president', 'france', 'say', 'brexit', 'dona...
3      2016-07-01  ['british', 'man', 'must', 'give', 'police', '...
4      2016-07-01  ['nobel', 'laureate', 'urge', 'greenpeace', 's...
...           ...                                                ...
73603  2008-06-08  ['man', 'go', 'berzerk', 'akihabara', 'stab', ...
73604  2008-06-08  ['threat', 'world', 'aid', 'pandemic', 'among'...
73605  2008-06-08  ['angst', 'ankara', 'turkey', 'steer', 'danger...
73606  2008-06-08  ['uk', 'identity', 'card', 'could', 'used', 's...
73607  2008-06-08  ['marriage', 'said', 'reduced', 'status', 'com...

[73608 rows x 2 columns]


In [42]:
data_df = pd.merge(reddit_news_df, dji_stock_price_df, on='Date', how='inner')  

In [43]:
print (data_df)

             Date                                             tokens  \
0      2016-06-30  ['jamaica', 'proposes', 'marijuana', 'dispense...   
1      2016-06-30  ['stephen', 'hawking', 'say', 'pollution', 'st...   
2      2016-06-30  ['boris', 'johnson', 'say', 'run', 'tory', 'pa...   
3      2016-06-30  ['six', 'gay', 'men', 'ivory', 'coast', 'abuse...   
4      2016-06-30  ['switzerland', 'denies', 'citizenship', 'musl...   
...           ...                                                ...   
50763  2008-06-09  ['future', 'united', 'state', 'europe', 'hand'...   
50764  2008-06-09  ['military', 'coup', 'zimbabwe', 'mugabe', 'fo...   
50765  2008-06-09  ['rising', 'oil', 'price', 'spark', 'strike', ...   
50766  2008-06-09  ['chvez', 'farc', 'asks', 'end', 'armed', 'str...   
50767  2008-06-09               ['flier', 'pain', 'airline', 'pack']   

              Close          High           Low          Open     Volume  \
0      17929.990234  17930.609375  17711.800781  17712.7597

In [45]:
data_df.drop(['Close', 'High', 'Low', 'Open', 'Volume'], axis=1, inplace=True)

In [46]:
print(data_df)

             Date                                             tokens  \
0      2016-06-30  ['jamaica', 'proposes', 'marijuana', 'dispense...   
1      2016-06-30  ['stephen', 'hawking', 'say', 'pollution', 'st...   
2      2016-06-30  ['boris', 'johnson', 'say', 'run', 'tory', 'pa...   
3      2016-06-30  ['six', 'gay', 'men', 'ivory', 'coast', 'abuse...   
4      2016-06-30  ['switzerland', 'denies', 'citizenship', 'musl...   
...           ...                                                ...   
50763  2008-06-09  ['future', 'united', 'state', 'europe', 'hand'...   
50764  2008-06-09  ['military', 'coup', 'zimbabwe', 'mugabe', 'fo...   
50765  2008-06-09  ['rising', 'oil', 'price', 'spark', 'strike', ...   
50766  2008-06-09  ['chvez', 'farc', 'asks', 'end', 'armed', 'str...   
50767  2008-06-09               ['flier', 'pain', 'airline', 'pack']   

       Close_diff  High_diff   Low_diff  Open_diff  Open_Close_diff  
0             NaN        NaN        NaN        NaN       217.2304

In [47]:
data_df.to_csv('train_raw_data.csv', index=False)

In [4]:
import pandas as pd
reddit_news_df = pd.read_csv("reddit_news_tokens.csv")
max_len = max(len(tokens) for tokens in reddit_news_df['tokens'])
print(max_len)

375
