# tweets for each company are saved in a separate csv file, we preprocess each of them and return a list of dataframes, each dataframe for one company.

In [1]:
import pandas as pd
import nltk
import re
import os

In [7]:
example_file = "./tweets/Cummins Inc.csv"

emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)

def remove_emoji(text):
    return emoji_pattern.sub(r'', text)

def preprocess_tweet(file):
    
    tweets_df = pd.read_csv(file, index_col=0)
    # drop duplicate
    tweets_df.drop_duplicates(inplace=True)
    
    # some special marks
    special_marks = ['&', '\n', '@', '$', '#', '☆', '!', '(', ')'
                     ',', '.', ';', ':', '💰', '✅']
    
    result_series = pd.Series()
    for date_time, text in tweets_df.iloc[:, 0].items():
        # tokenize
        text = remove_emoji(text)
        for sm in special_marks:
            text = text.replace(sm, "")
        tokens = text.split(" ")
        
        # Stemming and lemmatisation
        
        result_series[date_time] = tokens
        
    return result_series

preprocess_tweet(example_file)



2020-12-09 05:08:11    [FIST, gt, Bekins, Van, Lines,, Inc, gt, Brock...
2020-12-09 03:00:24    [HowmetAerospace, announced, that, its, board,...
2020-12-08 18:23:36    [RT, t5monkey, Will, Piers, Morgan, and, Suzi,...
2020-12-08 15:49:32    [Now, Hiring, Corporate, Counsel,, Emissions, ...
2020-12-08 14:18:47    [News, about, our, community, partner, at, Cum...
                                             ...                        
2020-11-30 17:47:52    [RT, Hoosiers4Renew, In, Indiana,, large, empl...
2020-11-30 14:19:41    [Cummins, to, Open, Hydrogen, Fuel, Cell, Plan...
2020-11-30 13:54:54    [Construction, on, the, flagship, of, Bering, ...
2020-11-30 04:48:41    [DieselGensets, Market, value, to, cross, 21, ...
2020-11-30 04:33:20    [PortableGenerators, Market, value, to, hit, 4...
Length: 67, dtype: object

In [8]:
# processing examples
file_list = os.listdir("./tweets")
#分公司放进不同的dataframe吧？
df_list = [preprocess_tweet("./tweets/"+file) for file in file_list]




In [10]:
file_list

['Align Technology.csv',
 'Ameriprise Financial.csv',
 'Aon Plc.csv',
 'CenterPoint Energy.csv',
 'CMCSA.csv',
 'Comcast Corporation.csv',
 'Cummins Inc.csv',
 'Dollar General Corporation.csv',
 'Extra Space Storage.csv',
 'J.P. Morgan.csv',
 'Jack Henry Associates.csv',
 'L3Harris Technologies.csv']