In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import gensim
import re
import nltk
from tqdm import tqdm

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
# Configure Bokeh (bokeh.io.output_notebook) so figures are displayed in the notebook.
output_notebook()

In [2]:
%%writefile clean_tokenizer.py
import pandas as pd
import numpy as np
import re
import nltk
import gensim
import tqdm

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import trange, tqdm


#Download nltk word libraries if not present
#nltk.download('all')

# Punctuation symbols stripped from tweets. clean_tweet() and basic_clean() splice this
# string verbatim into a regex character class ('[' + punctuation + ']+'), so the
# order of the characters and the escaped backslash in front of ']' are significant.
# NOTE(review): '#' is not part of this set - confirm that this is intentional.
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'         # define a string of punctuation symbols

# Functions to clean tweets
def remove_links(tweet):
    """Takes a string and removes web links from it.

    Removes anything starting with 'http', bit.ly short links, the literal
    '[link]' placeholder and pic.twitter links. Surrounding whitespace is
    left untouched.

    Args:
        tweet: tweet text (str).

    Returns:
        The tweet with the link fragments deleted.
    """
    tweet = re.sub(r'http\S+', '', tweet)   # remove http/https links
    # The dot is escaped so that only a literal "bit.ly/" is matched.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links
    # str.strip('[link]') would strip any of the characters "[link]" from both
    # ends of the tweet (e.g. "link king" -> " king"); replace() removes the
    # placeholder itself instead.
    tweet = tweet.replace('[link]', '')   # remove [link] placeholders
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)  # remove pic.twitter links
    return tweet

def remove_users(tweet):
    """Takes a string and removes retweet and @user information.

    Two passes are made: first 'RT @handle' retweet markers are deleted, then
    any remaining '@handle' mentions. A handle is '@' followed by one or more
    ASCII letters, digits or underscores (hyphens are also accepted after the
    first character, as in the previous pattern).

    Args:
        tweet: tweet text (str).

    Returns:
        The tweet with retweet markers and mentions deleted.
    """
    # Raw strings avoid the invalid "\s" escape sequence of a plain literal.
    # The handle may start with a digit or underscore and may be a single
    # character, so mentions such as "@_jane" or "@x" are removed as well.
    tweet = re.sub(r'(RT\s@[A-Za-z0-9_][A-Za-z0-9_-]*)', '', tweet)  # remove re-tweet
    tweet = re.sub(r'(@[A-Za-z0-9_][A-Za-z0-9_-]*)', '', tweet)  # remove tweeted at
    return tweet

def remove_hashtags(tweet):
    """Return ``tweet`` with every matching hashtag deleted.

    A hashtag is matched as '#', one or more ASCII letters, then one or more
    letters, digits, hyphens or underscores.
    """
    hashtag_pattern = re.compile('(#[A-Za-z]+[A-Za-z0-9-_]+)')
    return hashtag_pattern.sub('', tweet)

def remove_av(tweet):
    """Return ``tweet`` with every 'VIDEO:' and 'AUDIO:' label deleted.

    The labels are case-sensitive and are removed wherever they occur.
    """
    for label in ('VIDEO:', 'AUDIO:'):
        tweet = tweet.replace(label, '')
    return tweet

def lemmatize(tweet):
    """Tokenize ``tweet`` and return the lemma of every token that is kept.

    Tokens are produced by gensim.utils.simple_preprocess. A token is kept only
    when it is not in gensim's STOPWORDS set and is longer than three
    characters; each kept token is passed through lemmatize_stemming().
    """
    stop_tokens = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(tweet)
        if word not in stop_tokens and len(word) > 3  # drop stopwords and words of 3 or fewer characters
    ]


def lemmatize_stemming(token):
    """Return the WordNet lemma of ``token`` using pos='v'.

    Despite the name, only lemmatization is performed here; no stemmer is applied.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(token, pos='v')
    return lemma


def clean_tweet(tweet, bigrams=False):
    """Main master function to clean tweets: strips noisy text, then tokenizes and lemmatizes.

    Steps, in order: remove user mentions, links, hashtags and AUDIO/VIDEO
    labels; lower-case; replace runs of punctuation with a space; collapse
    whitespace; delete digits; tokenize and lemmatize with lemmatize().

    Args:
        tweet: raw tweet text (str).
        bigrams: when True, every pair of adjacent tokens is additionally
            appended as a 'first_second' token after the single tokens.

    Returns:
        The resulting tokens joined with single spaces (str).
    """
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    # Raw string: a plain '\s+' literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tweet = re.sub(r'\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers

    tokens = lemmatize(tweet)  # apply lemmatization and tokenization

    if bigrams:
        # Pair each token with its successor; an empty or one-token list yields no bigrams.
        tokens = tokens + [first + '_' + second for first, second in zip(tokens, tokens[1:])]
    return ' '.join(tokens)


def basic_clean(tweet):
    """Main master function to clean tweets only, without tokenization or removal of stopwords.

    Steps, in order: remove user mentions, links, hashtags and AUDIO/VIDEO
    labels; lower-case; replace runs of punctuation with a space; collapse
    whitespace; delete digits; delete the '📝 …' marker; collapse whitespace
    once more.

    Args:
        tweet: raw tweet text (str).

    Returns:
        The cleaned tweet (str). Leading and trailing spaces are not stripped.
    """
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    # Raw strings: a plain '\s+' literal is an invalid escape sequence.
    tweet = re.sub(r'\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)
    # Deleting digits and the marker can leave adjacent spaces behind
    # ("a 1 b" -> "a  b"), so collapse whitespace again as the last step.
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet


def tokenize_tweets(data_path):
    """Main function to read in and return clean data set.
    This can be used in Jupyter notebooks by importing this module and calling the tokenize_tweets() function

    The input file must contain a 'text' column. Each value is passed through
    clean_tweet() and the result is stored in a new 'clean_tweet' column.
    Missing text values are treated as empty strings.

    Args:
        data_path = path to input data set .csv file

    Returns:
        pandas data frame with cleaned tokens
    """

    tweets_df = pd.read_csv(data_path)
    # read_csv yields NaN (a float) for empty cells and may parse purely numeric
    # cells as numbers; clean_tweet() only accepts strings, so normalise first.
    tweet_text = tweets_df['text'].fillna('').astype(str)
    tweets_df['clean_tweet'] = tweet_text.apply(clean_tweet)
    num_tweets = len(tweets_df)
    print('Complete. Number of Tweets that have been cleaned and tokenized : {}'.format(num_tweets))
    return tweets_df

Overwriting clean_tokenizer.py


In [7]:
from clean_tokenizer import tokenize_tweets

# Clean and tokenize the raw tweet export, then persist the result.
tweets_df = tokenize_tweets('patient(MT).csv')
# index=False keeps the row index out of the file; otherwise each
# save/reload round trip adds another "Unnamed: 0" column to the data.
tweets_df.to_csv('patient(MT)_clean.csv', index=False)

Complete. Number of Tweets that have been cleaned and tokenized : 195476


In [7]:
from clean_tokenizer import tokenize_tweets

# Disabled preprocessing (presumably a one-off run - confirm before re-enabling):
# each raw yearly file is passed through tokenize_tweets() and the result is
# written to a matching *_clean.csv file.

# --- patient tweets, 2020-2023 ---
# tweets_df_2020 = tokenize_tweets('PCS_patient/PCS_patient_2020.csv')
# tweets_df_2021 = tokenize_tweets('PCS_patient/PCS_patient_2021.csv')
# tweets_df_2022 = tokenize_tweets('PCS_patient/PCS_patient_2022.csv')
# tweets_df_2023 = tokenize_tweets('PCS_patient/PCS_patient_2023.csv')

# tweets_df_2020.to_csv('PCS_patient_2020_clean.csv')
# tweets_df_2021.to_csv('PCS_patient_2021_clean.csv')
# tweets_df_2022.to_csv('PCS_patient_2022_clean.csv')
# tweets_df_2023.to_csv('PCS_patient_2023_clean.csv')


# --- non-patient tweets, 2020-2023 (the same variable names are reused) ---
# tweets_df_2020 = tokenize_tweets('PCS_patient/PCS_non_patient_2020.csv')
# tweets_df_2021 = tokenize_tweets('PCS_patient/PCS_non_patient_2021.csv')
# tweets_df_2022 = tokenize_tweets('PCS_patient/PCS_non_patient_2022.csv')
# tweets_df_2023 = tokenize_tweets('PCS_patient/PCS_non_patient_2023.csv')

# tweets_df_2020.to_csv('PCS_non_patient_2020_clean.csv')
# tweets_df_2021.to_csv('PCS_non_patient_2021_clean.csv')
# tweets_df_2022.to_csv('PCS_non_patient_2022_clean.csv')
# tweets_df_2023.to_csv('PCS_non_patient_2023_clean.csv')

# Load the cleaned 2020 files: df1 = patient tweets, df2 = non-patient tweets.
# NOTE(review): the disabled to_csv() calls above write into the working directory,
# while these reads expect the files under PCS_patient_clean/ - confirm they were moved there.
df1=pd.read_csv("PCS_patient_clean/PCS_patient_2020_clean.csv")
df2=pd.read_csv("PCS_patient_clean/PCS_non_patient_2020_clean.csv")


In [8]:
# tweets_df_2023_p = tokenize_tweets('PCS_patient_clean/PCS_patient/PCS_patient_2023_clean.csv')
# tweets_df_2023_p_1=tweets_df1[tweets_df1['month'].isin([1])]
# tweets_df_2020.to_csv('PCS_patient_2020.csv')


In [63]:
# Build the names of the cleaned CSV files: the four patient files
# (2020-2023) followed by the four non-patient files (2020-2023).
filelist = (
    ['PCS_patient_202' + str(year_digit) + '_clean.csv' for year_digit in range(4)]
    + ['PCS_non_patient_202' + str(year_digit) + '_clean.csv' for year_digit in range(4)]
)
# print(filelist)
# ['PCS_patient_2020_clean.csv', 'PCS_patient_2021_clean.csv', 'PCS_patient_2022_clean.csv', 'PCS_patient_2023_clean.csv', 
#  'PCS_non_patient_2020_clean.csv', 'PCS_non_patient_2021_clean.csv', 'PCS_non_patient_2022_clean.csv', 'PCS_non_patient_2023_clean.csv']
createVar = locals()
myVarList = []  # holds the DataFrames created in this cell
for i in range(len(filelist)):
    if i >= 1:
        # Only the first file (index 0) is loaded and inspected for now.
        continue
    # The loaded frame is stored in the namespace under its file name ...
    createVar[filelist[i]] = pd.read_csv('PCS_patient_clean/' + filelist[i])
    # ... and appended to the list, so it can be reached by position as well.
    myVarList.append(createVar[filelist[i]])
    current_frame = myVarList[i]
    for j in range(12):
        # Rows whose 'month' column equals the 1-based month number j + 1.
        month_rows = current_frame[current_frame['month'].isin([j + 1])]
        if month_rows.empty:
            print('0000000000000000000000000000000000')
            continue
        print(j)
        tweets_df = month_rows
        print('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
        print(tweets_df)
        print('bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb')
        #print(tweets_df)
        # TODO(review): placeholder left in the original ("condcution"); the
        # per-month processing presumably goes here - confirm.
first_frame = myVarList[0]
print(first_frame[first_frame['month'].isin([5])])

0000000000000000000000000000000000
0000000000000000000000000000000000
0000000000000000000000000000000000
0000000000000000000000000000000000
4
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
       Unnamed: 0  Unnamed: 0.1                       Date             User  \
11570       11570       2302183  2020-05-31 00:37:33+00:00  SayitwithStace1   
11571       11571       2302251  2020-05-28 11:15:20+00:00  elliemarietweet   
11572       11572       2302259  2020-05-28 05:09:51+00:00    MadeAmericans   
11573       11573       2302300  2020-05-26 18:07:49+00:00        MalcolmOh   
11574       11574       2302366  2020-05-23 13:03:35+00:00  Caseshotpublish   
11575       11575       2302406  2020-05-21 19:06:33+00:00   IAMCHARMONTANA   
11576       11576       2302411  2020-05-21 12:45:43+00:00       Diddipops1   
11577       11577       2302414  2020-05-21 10:30:16+00:00        WyldeBoiy   
11578       11578       2302475  2020-05-19 02:12:03+00:00         LLLloyd1   
11579       11579       23