This notebook describes the cleanup process of the data.

In [4]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import re
import langdetect
import tqdm
import itertools as it
from operator import add

from wordcloud import WordCloud
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize

In [6]:
# read data
df = pd.read_csv(r"C:\Users\anees\Documents\Work\Twitter\nicotine_filter.csv", error_bad_lines=False, warn_bad_lines=False)

In [7]:
df.shape

(434769, 4)

Certain cells are empty or corrupted. We'll have to remove them.

In [8]:
df = df.dropna()
print(df.shape)
df = df[df['id'].map(len) == 19]
print(df['id'].is_unique, df.shape)
df = df[df['CreatedAt'].map(len) == 19] 
print(df.shape)
df = df[df['userId'].map(str.isnumeric) == True]
print(df.shape)

(418416, 4)
True (418206, 4)
(418202, 4)
(417139, 4)


Thus, 417139 tweets contain the keyword 'nicotine'. 

In [None]:
df.to_csv('cleaned.csv', index=False)

In [11]:
def remove_nonenglish(df):
    '''
        removes non-english tweets from the data
    '''
    en = 0
    flags = []
    for post in tqdm.tqdm(text): 
        try:
            ch = langdetect.detect(post)
            if ch == 'en':
                en +=1
                flags.append(True)
            else:
                flags.append(False)
        except Exception as e:
            if e == Exception.KeyboardInterrupt:
                break
            print(post)
            flags.append(False)
    return df[flags]

stopwords = set(nltk.corpus.stopwords.words('english'))
punctuation = (string.punctuation)
url_regex = re.compile(
    r'^(?:http|ftp)s?://' # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
    r'localhost|' #localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
tokenizer = nltk.tokenize.TweetTokenizer()
lemmatizer = nltk.WordNetLemmatizer()
punctuations = set(string.punctuation)

def lemmatizeToken(token, tag):
    '''
        Lemmatizer function. Four tags are considered: nouns, verbs, adjectives and adverbs
    '''
    tag = {
        'N': nltk.corpus.wordnet.NOUN,
        'V': nltk.corpus.wordnet.VERB,
        'R': nltk.corpus.wordnet.ADV,
        'J': nltk.corpus.wordnet.ADJ
    }.get(tag[0], nltk.corpus.wordnet.NOUN)
    return lemmatizer.lemmatize(token, tag)

def normalize_tweets(tweets,lemmatize):
    '''
        Creates one-grams and bi-grams from text data. 
        lemmatize=1 lemmatizes the data.
        Data is also normalized during the process. (removes hashtags, urls, stopwords
        , basic punctuation, non-printable characters)
        normalizes mentions
    '''
    onegrams = defaultdict(lambda:0)
    bigrams = defaultdict(lambda:0)
    onegram_dict = defaultdict(lambda:0)
    bigram_dict = defaultdict(lambda:0)
    loc_onegrams = []
    loc_bigrams = []
    prev = None
    for i in tqdm.trange(len(tweets)):
        tweet = tweets[i]
        lonegram = set()
        lbigram = set()
        tweet = tweet.encode('ascii', 'ignore').decode('ascii')
        for token, tag in nltk.pos_tag(tokenizer.tokenize(tweet.lower())):
            if token[0] == '#' or len(token) < 2 or url_regex.search(token):
                continue
            elif token in stopwords:
                continue
            elif all(char in punctuations for char in token):
                continue
            elif token[0] == '@':
                token = '@person' #Replace all friend tags with a common token.
                tag = 'NNP'
            word = lemmatizeToken(token, tag) if lemmatize else token
            onegrams[word] += 1
            lonegram.add(word)
            #add word location to onegram dictionary
            if word != "nicotine":
                if word not in onegram_dict:
                    onegram_dict[word] = [i]
                elif i != onegram_dict[word][-1]:
                    onegram_dict[word] += [i]
            if prev is not None:
                bigram = '-'.join([prev, word])
                bigrams[bigram] += 1
                lbigram.add(bigram)
                #add bigram location to bigram dictionary
                if bigram not in bigram_dict:
                    bigram_dict[bigram] = [i]
                elif i != bigram_dict[bigram][-1]:
                    bigram_dict[bigram] += [i]
                
            prev = word
        loc_onegrams.append(lonegram)
        loc_bigrams.append(lbigram)
        
    return onegrams,bigrams, onegram_dict, bigram_dict, loc_onegrams, loc_bigrams

def load_data(file_name, non_english=True):
    '''
        opens csv and runs normalize_tweets and generates onegrams and bigrams.
    '''
    df = pd.read_csv(file_name)
    if(non_english):
        df = remove_nonenglish(df)
    tweets = list(df['text'])
    print("Tweets count: ",len(tweets))
    return(normalize_tweets(tweets,1))

In the above function, we remove non-english tweets and also generate bigrams and onegrams for english tweets. By removing non-english tweets, we are left with 371642 tweets. 

In [12]:
onegrams, bigrams, onegram_dict, bigram_dict, loc_onegram, loc_bigram = load_data("english.csv", non_english=False)

Tweets count:  371642


100%|█████████████████████████████████████████████████████████████████████████| 371642/371642 [09:17<00:00, 667.17it/s]


Tweets containing the phrases "bad nicotine" ("worse than nicotine" is normalised to "bad nicotine"), "nicotine herion", and "nicotine stain" used in conjunction with "silver spoons" are references to song lyrics (Nicotine by Panic at the Disco, Never Be The Same by Camila Cabello, Nobody Home by Pink Floyd) and are not related to nicotine products. These tweets are removed. 

In [13]:
df = pd.read_csv('english.csv')
text = df['text']

In [14]:
def findWholeWord(w):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search

In [20]:
remove_index = []

In [21]:
nicotine_stain_silver = []
count = 0
for i in bigram_dict['nicotine-stain']:
    sent = text[i]
    if(findWholeWord('silver spoon')(sent)):
        remove_index.append(i)
        count += 1
print(count)

68


In [22]:
print(bigrams['bad-nicotine'])
remove_index.extend(bigram_dict['bad-nicotine'])

3524


In [23]:
print(bigrams['nicotine-heroin'])
remove_index.extend(bigram_dict['nicotine-heroin'])

2744


In [24]:
print(bigrams['nicotine-heroine'])
remove_index.extend(bigram_dict['nicotine-heroine']) # misspelling of nicotine herion

939


In [27]:
remove_index = np.unique(remove_index)
print(len(remove_index))

7212


7212 tweets are removed.

In [28]:
df = df.drop(remove_index)

In [29]:
print(df.shape)

(364430, 4)


There are 364430 tweets left. We now have to separate bot and non-bot users.

In [30]:
users = df['userId']
unique_users = users.unique()
print(len(unique_users))
userdf = pd.DataFrame(unique_users)
userdf.to_csv('users.csv', index=False)

214514


There are 214514 unique users

The list of users generated above is used by the bot detection script that can be found as **bot_script.py** in the repository. 
**Note:** The script takes atleast a week to fully cover all users 

There are several users who have deleted their profiles on Twitter. We cannot determine now if they were bots or not. We'll have to drop their posts. 

In [1]:
import pickle

In [2]:
with open('combined_users.pickle', 'rb') as file:
    user_scores = pickle.load(file)

In [9]:
existing_users = []
count = 0
for user in tqdm.tqdm(unique_users):
    if user_scores[user] != None:
        existing_users.append(user)
    else:
        count += 1

100%|██████████████████████████████████████████████████████████████████████| 214514/214514 [00:00<00:00, 673494.10it/s]


In [10]:
print("Number of deleted users: " + str(count))

Number of deleted users: 27186


In [11]:
existing_set = set(existing_users)

In [15]:
# remove posts belonging to deleted users
existing_df_temp = []
count = 0
for row in tqdm.tqdm(df.itertuples()):
    if row.userId in existing_set:
        existing_df_temp += [row]
    else:
        count += 1

print("Number of posts removed: " + str(count))

364430it [00:01, 224932.21it/s]


Number of posts removed: 42890


42890 posts are removed because of deleted users

In [16]:
bot_df = []
real_df = []
bot_count = 0
for row in tqdm.tqdm(df.itertuples()):
    user = row.userId
    if user_scores[user] == None:
        continue;
    elif user_scores[user]['display_scores']['english'] >= 4:
        bot_count+=1
        bot_df += [row]
    else:
        real_df += [row]

364430it [00:01, 197139.45it/s]


In [17]:
real_df = pd.DataFrame(real_df)
bot_df = pd.DataFrame(bot_df)

In [18]:
print(real_df.shape[0])

300360


The number of non-bot tweets are 300360

In [19]:
print(bot_df.shape[0])

21180


The number of bot tweets are 21180

In [22]:
len(real_df.userId.unique())

181439

The number of non-bot users is 181439 

In [23]:
len(bot_df.userId.unique())

5889

The number of non-bot users is 5889

In [None]:
bot_df.to_csv('bots.csv', index=False)
real_df.to_csv('non_bots.csv', index=False)

The cleanup process is complete and the above two dataframes are used in the analysis.