# TWITTER ANALYSIS - MENTORSHIP PROJECT


#### STEP 1. CONFIGURATION: ESTABLISHING CONNECTION TO THE API
*Using Tweepy*

In [1]:
import tweepy 
import configparser
import requests     # For saving access tokens and for file management when creating and adding to the dataset
import os           # For dealing with json responses we receive from the API
import json         # For displaying the data after
import pandas as pd # For saving the response data in CSV format
import csv          # For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata  #To add wait time between requests
import time
import sqlite3

In [2]:
#read configs
config = configparser.ConfigParser()
config.read('config.ini')

api_key             = config['twitter']['api_key']
api_key_secret      = config['twitter']['api_key_secret']

access_token        = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

bearer_token        = config['twitter']['bearer_token']

In [3]:
#Authenticate our account with the Twitter API
auth    = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api     = tweepy.API(auth, wait_on_rate_limit=True)
    
# You can authenticate as your app with just your bearer token
client  = tweepy.Client(bearer_token=bearer_token)

# If the authentication was successful, this should print the
# screen name / username of the account
print(api.verify_credentials().screen_name)

KLuthra_


### STEP 2. DATA EXTRACTION & STORAGE
####  2.1. Defining Data Model Schemas for Tweet & User Data

In [4]:
# Set up SQLite database
conn = sqlite3.connect('twitter_data.db')
c = conn.cursor()

In [5]:
# Drop existing tables
# c.execute('DROP TABLE IF EXISTS tweets')
# c.execute('DROP TABLE IF EXISTS users')

In [6]:
# Create table for tweet data
c.execute('''CREATE TABLE IF NOT EXISTS tweets
             (tweet_id INTEGER PRIMARY KEY,
              author_id INTEGER,
              created_at TIMESTAMP,
              text TEXT,
              tweet_metrics JSON,
              entities JSON,
              context JSON,
              place_id JSON,
              FOREIGN KEY (author_id) REFERENCES users(author_id),
              FOREIGN KEY (place_id) REFERENCES users(place_id))''')

<sqlite3.Cursor at 0x207f5c32f80>

In [7]:
# c.execute("SELECT COUNT(*) FROM tweets")
# row_count = c.fetchone()[0]
# print(f"Number of rows in 'tweets' table: {row_count}")

In [8]:
# Create table for user data
c.execute('''CREATE TABLE IF NOT EXISTS users
             (author_id INTEGER PRIMARY KEY,
              username TEXT,
              verified TEXT,
              bio TEXT,
              author_created TIMESTAMP,
              author_location TEXT,
              followers_count INTEGER,
              following_count INTEGER,
              tweet_count INTEGER,
              entities JSON,
              FOREIGN KEY (author_id) REFERENCES tweets(author_id))''')

<sqlite3.Cursor at 0x207f5c32f80>

####  2.2. Defining a tweet fetching function using Tweepy

**__Pagination:__** Pagination is a feature in Twitter API v2 endpoints that return more results than can be returned in a single response. When that happens, the data is returned in a series of 'pages'. Pagination refers to methods for programatically requesting all of the pages, in order to retrieve the entire result data set. Not all API endpoints support or require pagination, but it is often used when result sets are large.

**Paginator** can be used to paginate for any Client methods that support pagination

In [9]:
def get_tweets(query, max_results):

  expansions    = ['author_id','in_reply_to_user_id','geo.place_id','entities.mentions.username','referenced_tweets.id','referenced_tweets.id.author_id']
  tweet_fields  = ['id','text','author_id','attachments','context_annotations','created_at','entities','lang','geo','public_metrics']
  user_fields   = ['id','name','username','created_at','description','entities','location','public_metrics','verified']
  place_fields  = ['full_name','id','country','country_code','geo','name','place_type']
  try:
    # call twitter api to fetch tweets
    fetched_tweets = tweepy.Paginator(client.search_recent_tweets, query=query,
      expansions        =expansions,
      tweet_fields      =tweet_fields,
      place_fields      =place_fields,
      user_fields       =user_fields,   
      max_results       =max_results
    ).flatten()
    
    return fetched_tweets
    

  except Exception as e:
    print("Error getting tweets", e)

####  2.2. Extracting Domains and Entities from the Twitter API
*Annotations have been added to the Tweet object from all v2 endpoints that return a Tweet object. Tweet annotations offer a way to understand contextual information about the Tweet itself. Though 100% of Tweets are reviewed, due to the contents of Tweet text, only a portion are annotated.*

##### **Tweet annotation types**
**Entities** Entity annotations are programmatically defined entities that are nested within the entities field and are reflected as annotations in the payload. Each annotation has a confidence score and an indication of where in the Tweet text the entities were identified (start and end fields).

The entity annotations can have the following types:

1. Person - Barack Obama, Daniel, or George W. Bush
2. Place - Detroit, Cali, or "San Francisco, California"
3. Product - Mountain Dew, Mozilla Firefox
4. Organization - Chicago White Sox, IBM
5. Other - Diabetes, Super Bowl 50

**Context annotations** are delivered as a context_annotations field in the payload. These annotations are inferred based on semantic analysis (keywords, hashtags, handles, etc) of the Tweet text and result in domain and/or entity labels. Context annotations can yield one or many domains. At present, we‚Äôre using a list of 80+ domains reflected in the table below.  
1. ID - 45: Brand Vertical
2. ID - 46: Brand Category
3. ID - 47: Brand
4. ID - 48: Product

##### 2.2.1 DOMAIN-ENTITY QUERY CONSTRUCTION 
The *search_recent_tweets* function within the Twitter API has a query limit of 512 characters. To work around this, I have created a list of strings, less than 512 characters long, which contain the domain_id.entity_id search query broken up into chunks of 512 characters or less each which I will iterate through when making API requests to retrieve tweets

In [10]:
domain_df = pd.read_csv('twitter-context-annotations/files/evergreen-context-entities-20220601.csv')
import itertools
def automate_domain_filter(df, start_id, end_id, chunk_size, domain_chunk_count):
    chunks_list = []
    for i in range(start_id, end_id+1):
        context_list = []
        mask = df['domains'].str.contains('^{}$'.format(i))
        filtered_df = df[mask]
        for index, row in filtered_df.iterrows():
            domain_id = row['domains']
            entity_id = row['entity_id']
            entity_name = row['entity_name']   
            # construct the query string
            context = f'context:{domain_id}.{entity_id}'
            context_list.append(context)
            context_query = ' OR '.join(context_list)
        code = context_query
        chunks = []
        start = 0
        counter = 0
        while start < len(code) and counter < domain_chunk_count[i]:
            end = start + chunk_size
            if end >= len(code):
                end = len(code)
            end = code.rfind(" OR ", start, end)
            if end == -1:
                end = start + chunk_size
            chunk = code[start:end]
            if chunk.startswith(" OR "):
                chunk = chunk[4:]
            chunks.append(chunk)
            start = end
            counter += 1
        chunks_list.append(chunks)
    return list(itertools.chain.from_iterable(chunks_list))

chunk_size = 350
domain_chunk_count = {45: 1, 46: 6, 47: 276, 48: 69}
chunks_list = automate_domain_filter(domain_df, 45, 48, chunk_size, domain_chunk_count)
# print(chunks_list)

##### 2.2.2 Defining Pre-Extraction Filtering 
- [ X ] Language restricted to English 
- [ X ] No Retweets or Quote Retweets Allowed
- [ X ] Filtering for tweets within Domains 45 through 48 (*all entities*)
- [ X ] Tweets must have mentions (*indicates presence of brand/sponsor*)
- [ X ] Hashtag List consisting of indications that the tweet is being promoted or sponsored
- [ ] Possible Entity Names which are irrelevant

##### 2.2.3 Defining Post-Extraction Filtering 
  1. Accounts that have a high ratio of followers to following (e.g., following fewer than 100 accounts but having thousands of followers)
  2. Number of Followers
  1. Accounts that use a large number of hashtags in their tweets (e.g., more than 5 hashtags per tweet).
  2. Accounts that use a lot of capital letters or exclamation points in their tweets.
  3. Accounts that have a high percentage of tweets that contain links (e.g., more than 50% of tweets contain links).   
  5. Using the Botometer API to extract a score for each user that indicates the probabibily of the account being a bot. 

In [11]:
def filter_rule(chunk, hash_include=True):
    text_list       = '(#ad OR #sponsored OR #promoted OR "Learn More" OR "Shop Now")'
    lang            = '(lang:en)'
    rt              = '(-is:retweet) (-"RT")' 
    domain            = chunk
    mention         = 'has:mentions'
    if hash_include == True:
        query           = text_list + ' ' + lang + ' ' + rt + ' ' + mention + ' ' + '(' + domain + ')'
    else: 
        query           = lang + ' ' + rt + ' ' + mention + ' ' + '(' + domain + ')'

    return query

In [12]:
import time
import tweepy
import pandas as pd

processed_tweets = 0
total_tweets = 0

tweet_data = []  # List to store tweet data

for chunk in chunks_list:
    print(f'Chunk: {chunk}')
    
    query = filter_rule(chunk=chunk,hash_include=True)
    paginator = get_tweets(query=query, max_results=100)

    if paginator is None:
        print('Error: Paginator is None. Skipping chunk.')
        continue
    
    for tweet in paginator:
        tweet_info = {
                'tweet_id': tweet.id,
                'author_id': tweet.author_id,
                'created_at': tweet.created_at,
                'text': tweet.text,
                'tweet_metrics': json.dumps(tweet.public_metrics),
                'entities': json.dumps(tweet.entities),
                'context': json.dumps(tweet.context_annotations),
                'place_id': json.dumps(tweet.geo) if tweet.geo else None
            }
            
        tweet_data.append(tweet_info)
        processed_tweets += 1
    
    
    print(f'Finished processing chunk: {chunk}')
    print(f'Progress: {processed_tweets} tweets processed.')
    time.sleep(3)  # Pause for 5 minutes between chunks to avoid hitting rate limits

Chunk: context:45.781972125171060736 OR context:45.781974597226799105 OR context:45.781974596740190208 OR context:45.781974596161376261 OR context:45.781974597474263040 OR context:45.781974597310615553 OR context:45.781974596157251587
Finished processing chunk: context:45.781972125171060736 OR context:45.781974597226799105 OR context:45.781974596740190208 OR context:45.781974596161376261 OR context:45.781974597474263040 OR context:45.781974597310615553 OR context:45.781974596157251587
Progress: 3605 tweets processed.
Chunk: context:46.872801340224806912 OR context:46.781974596144668673 OR context:46.781974596153057282 OR context:46.781974596715024385 OR context:46.781974596807299072 OR context:46.781972125179518977 OR context:46.781974596731871232 OR context:46.781974597222604800 OR context:46.781974596656304128 OR context:46.781974596710830081
Finished processing chunk: context:46.872801340224806912 OR context:46.781974596144668673 OR context:46.781974596153057282 OR context:46.781974

In [13]:
# Convert tweet data to a DataFrame
df = pd.DataFrame(tweet_data)
print(df.shape)

num_distinct_tweets = df['tweet_id'].nunique()
print(f"Number of distinct tweet IDs: {num_distinct_tweets}")

(20068, 8)
Number of distinct tweet IDs: 10011


In [16]:
# Convert tweet_data list to a DataFrame
df = pd.DataFrame(tweet_data)

# Sort the DataFrame by 'tweet_id' in descending order
df.sort_values('tweet_id', ascending=True, inplace=True)

# Drop duplicate rows based on 'tweet_id' column, keeping the last occurrence
dedup_df = df.drop_duplicates(subset='tweet_id', keep='last', inplace=False).reset_index(drop=True, inplace=False)

print(dedup_df.shape)

(10011, 8)


In [17]:
processed_tweets = 0

for index, tweet in dedup_df.iterrows():
    try:
        created_at = tweet['created_at'].strftime('%Y-%m-%d %H:%M:%S')
        # Check if tweet with the same tweet_id already exists in the database
        c.execute('SELECT tweet_id FROM tweets WHERE tweet_id=?', (tweet['tweet_id'],))
        existing_tweet_id = c.fetchone()

        if existing_tweet_id is None:
            # Tweet doesn't exist in the database, insert it
            c.execute('''INSERT INTO tweets 
                         (tweet_id, author_id, created_at, text, tweet_metrics, entities, context, place_id) 
                         VALUES (?, ?, ?, ?, ?, ?, ?, ?)''',
                      (tweet['tweet_id'], tweet['author_id'], created_at, tweet['text'],
                       tweet['tweet_metrics'], tweet['entities'], tweet['context'], tweet['place_id']))
            print(f"New Tweet Appended")
        else:
            # Tweet already exists, update tweet_metrics
            c.execute('''UPDATE tweets 
                         SET tweet_metrics = ? 
                         WHERE tweet_id = ?''',
                      (tweet['tweet_metrics'], tweet['tweet_id']))
            print(f"Tweet Already Exists, Updating Tweet Metrics")
        processed_tweets += 1
        
        print(f'Progress: {processed_tweets} tweets processed.')

    except Exception as e:
        print(f"Error inserting row: {tweet}")
        print(f"Error message: {e}")

# Commit the changes and close the connection
conn.commit()

Tweet Already Exists, Updating Tweet Metrics
Progress: 1 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 2 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 3 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 4 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 5 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 6 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 7 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 8 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 9 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 10 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 11 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 12 tweets processed.
Tweet Already Exists, Updating Tweet Metrics
Progress: 13 tweets processed.
Tweet Already Exists,

In [20]:
c.execute("SELECT COUNT(DISTINCT tweet_id) FROM tweets")
row_count = c.fetchone()[0]
print(f"Number of rows in 'tweets' table: {row_count}")

Number of rows in 'tweets' table: 21230


In [19]:
# Get unique author IDs from the tweets table
c.execute("SELECT DISTINCT author_id FROM tweets")
author_ids = [row[0] for row in c.fetchall()]

user_data = []
batch_size = 100
n = 0

# Iterate over batches of author IDs
for i in range(0, len(author_ids), batch_size):      
    # try:
    users = client.get_users(ids=author_ids[i:i+batch_size], user_fields=['id', 'name', 'username', 'created_at', 'description', 'entities', 'location', 'public_metrics', 'verified'])

    # Insert or update the user data in the database
    for user in users.data:
        # Check if author already exists in the database
        c.execute("SELECT author_id FROM users WHERE author_id=?", (user.id,))
        existing_author_id = c.fetchone()

        if existing_author_id is None:
            # Author doesn't exist in the database, insert a new row
            author_created = user.created_at.strftime('%Y-%m-%d %H:%M:%S')
            c.execute('''INSERT INTO users (author_id, username, verified, bio, author_created, author_location, 
                         followers_count, following_count, tweet_count, entities)
                         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                      (user.id, user.username, user.verified, user.description, author_created,
                       user.location, user.public_metrics['followers_count'], user.public_metrics['following_count'],
                       user.public_metrics['tweet_count'], json.dumps(user.entities)))
            print(f"Stored author: {user.name} (@{user.username}), id={user.id}")
        else:
            # Author already exists in the database, update the existing row
            author_created = user.created_at.strftime('%Y-%m-%d %H:%M:%S')
            c.execute('''UPDATE users SET username=?, verified=?, bio=?, author_created=?, author_location=?, 
                         followers_count=?, following_count=?, tweet_count=?, entities=?
                         WHERE author_id=?''',
                      (user.username, user.verified, user.description, author_created,
                       user.location, user.public_metrics['followers_count'], user.public_metrics['following_count'],
                       user.public_metrics['tweet_count'], json.dumps(user.entities), user.id))
            print(f"Updated author: {user.name} (@{user.username}), id={user.id}")

    time.sleep(16)


Stored author: Stripe Support (@stripesupport), id=564392197
Stored author: HPE Alliances (@HPE_Alliances), id=114881581
Stored author: Regilin Singh (@RegilinSingh), id=4110151066
Stored author: Coalition for Deep Space Exploration (CDSE) (@XploreDeepSpace), id=2176731084
Stored author: SciLifeLab_DataCentre (@SciLifeLab_DC), id=1383040819276492800
Stored author: Topeka Fire Department (@Topekafire), id=720279932610355201
Stored author: voterunlead (@VoteRunLead), id=2396505878
Stored author: AD Knowledge Portal (@AMPADPortal), id=868219215823511552
Stored author: FunkoPop Tracker (@FunkoTrack), id=1020556532
Stored author: SoftwareOne USA (@SoftwareOne_USA), id=1347031344
Stored author: Niagara-on-the-Lake (@Town_of_NOTL), id=320754569
Stored author: UNEP-WCMC (@unepwcmc), id=144993330
Stored author: Boomer (@BoomerExplains), id=1280281664892080128
Stored author: #SneakerScouts (@SneakerScouts), id=792182745942233089
Stored author: Roland Zigerli (@BigZiii), id=556701043
Stored autho

**DATA PRE-PROCESSING**

*Remove special characters
Whitespace
Hyperlinks
Stop words
TFIDF
Lemmatization*



In [21]:
import re
from functools import partial
from collections import Counter
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [22]:
def removeUnicode(text):
    """ Removes unicode strings like "\u002c" and "x96" """
    text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', text)       
    text = re.sub(r'[^\x00-\x7f]',r'',text)
    return text

def replaceURL(text):
    """ Replaces url address with "url" """
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','url',text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

def replaceAtUser(text):
    """ Replaces "@user" with "atUser" """
    text = re.sub('@[^\s]+','atUser',text)
    return text

def removeHashtagInFrontOfWord(text):
    """ Removes hastag in front of a word """
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

def removeNumbers(text):
    """ Removes integers """
    text = ''.join([i for i in text if not i.isdigit()])         
    return text

def replaceMultiExclamationMark(text):
    """ Replaces repetitions of exlamation marks """
    text = re.sub(r"(\!)\1+", ' multiExclamation ', text)
    return text

def replaceMultiQuestionMark(text):
    """ Replaces repetitions of question marks """
    text = re.sub(r"(\?)\1+", ' multiQuestion ', text)
    return text

def replaceMultiStopMark(text):
    """ Replaces repetitions of stop marks """
    text = re.sub(r"(\.)\1+", ' multiStop ', text)
    return text

In [23]:
def countMultiExclamationMarks(text):
    """ Replaces repetitions of exlamation marks """
    return len(re.findall(r"(\!)\1+", text))

def countMultiQuestionMarks(text):
    """ Count repetitions of question marks """
    return len(re.findall(r"(\?)\1+", text))

def countMultiStopMarks(text):
    """ Count repetitions of stop marks """
    return len(re.findall(r"(\.)\1+", text))

def countElongated(text):
    """ Input: a text, Output: how many words are elongated """
    regex = re.compile(r"(.)\1{2}")
    return len([word for word in text.split() if regex.search(word)])

def countAllCaps(text):
    """ Input: a text, Output: how many words are all caps """
    return len(re.findall("[A-Z0-9]{3,}", text))

In [25]:
""" Creates a dictionary with slangs and their equivalents and replaces them """
with open('slang.txt') as file:
    slang_map = dict(map(str.strip, line.partition('\t')[::2])
    for line in file if line.strip())

slang_words = sorted(slang_map, key=len, reverse=True) # longest first for regex
regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

def countSlang(text):
    """ Input: a text, Output: how many slang words and a list of found slangs """
    slangCounter = 0
    slangsFound = []
    tokens = nltk.word_tokenize(text)
    for word in tokens:
        if word in slang_words:
            slangsFound.append(word)
            slangCounter += 1
    return slangCounter, slangsFound

""" Replaces contractions from a string to their equivalents """
contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
def replaceContraction(text):
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    return text

def replaceElongated(word):
    """ Replaces an elongated word with its basic form, unless the word exists in the lexicon """

    repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    repl = r'\1\2\3'
    if wordnet.synsets(word):
        return word
    repl_word = repeat_regexp.sub(repl, word)
    if repl_word != word:      
        return replaceElongated(repl_word)
    else:       
        return repl_word


In [26]:
def removeEmoticons(text):
    """ Removes emoticons from text """
    text = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', '', text)
    return text

def countEmoticons(text):
    """ Input: a text, Output: how many emoticons """
    return len(re.findall(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', text))


In [28]:
## Spell Correction begin ###
""" Spell Correction http://norvig.com/spell-correct.html """
def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('corporaForSpellCorrection.txt').read()))

def P(word, N=sum(WORDS.values())): 
    """P robability of `word`. """
    return WORDS[word] / N

def spellCorrection(word): 
    """ Most probable spelling correction for word. """
    return max(candidates(word), key=P)

def candidates(word): 
    """ Generate possible spelling corrections for word. """
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    """ The subset of `words` that appear in the dictionary of WORDS. """
    return set(w for w in words if w in WORDS)

def edits1(word):
    """ All edits that are one edit away from `word`. """
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    """ All edits that are two edits away from `word`. """
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

### Spell Correction End ###

In [29]:
#
def addNotTag(text):
	""" Finds "not,never,no" and adds the tag NEG_ to all words that follow until the next punctuation """
	transformed = re.sub(r'\b(?:not|never|no)\b[\w\s]+[^\w\s]', 
       lambda match: re.sub(r'(\s+)(\w+)', r'\1NEG_\2', match.group(0)), 
       text,
       flags=re.IGNORECASE)
	return transformed

def addCapTag(word):
    """ Finds a word with at least 3 characters capitalized and adds the tag ALL_CAPS_ """
    if(len(re.findall("[A-Z]{3,}", word))):
        word = word.replace('\\', '' )
        transformed = re.sub("[A-Z]{3,}", "ALL_CAPS_"+word, word)
        return transformed
    else:
        return word

In [33]:
import string
import nltk
nltk.download('stopwords')

""" Tokenizes a text to its words, removes and replaces some of them """    
finalTokens = [] # all tokens
stoplist = stopwords.words('english')
my_stopwords = "multiexclamation multiquestion multistop url atuser st rd nd th am pm" # my extra stopwords
stoplist = stoplist + my_stopwords.split()
allowedWordTypes = ["J","R","V","N"] #  J is Adject, R is Adverb, V is Verb, N is Noun. These are used for POS Tagging
lemmatizer = WordNetLemmatizer() # set lemmatizer
stemmer = PorterStemmer() # set stemmer

# Create a DataFrame to store the preprocessed text and tweet ID
preprocessed_data = pd.DataFrame(columns=['tweet_id', 'text'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kushl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [34]:
def tokenize(text, wordCountBefore, tweet_id):
    totalAdjectives = 0
    totalAdverbs = 0
    totalVerbs = 0
    onlyOneSentenceTokens = []  # tokens of one sentence each time

    tokens = nltk.word_tokenize(text)

    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)  # Technique 7: remove punctuation

    tokens = nltk.word_tokenize(text)

    tagged = nltk.pos_tag(tokens)  # Technique 13: part of speech tagging
    for w in tagged:
        if w[1][0] in allowedWordTypes and w[0] not in stoplist:
            final_word = addCapTag(w[0])
            final_word = replaceElongated(final_word)
            if len(final_word) > 1:
                final_word = spellCorrection(final_word)
            final_word = lemmatizer.lemmatize(final_word)
            final_word = stemmer.stem(final_word)

            onlyOneSentenceTokens.append(final_word)
            finalTokens.append(final_word)

    onlyOneSentence = " ".join(onlyOneSentenceTokens)

    # Store the preprocessed text and tweet ID in the DataFrame
    preprocessed_data.loc[len(preprocessed_data)] = [tweet_id, onlyOneSentence]

    return finalTokens


In [40]:
from time import time
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Retrieve all tweets from the database
query = "SELECT DISTINCT * FROM tweets"
df_tweets = pd.read_sql_query(query, conn)
#Tweet_id is column 0, text is column 3


t0 = time()
totalSentences = 0
totalEmoticons = 0
totalSlangs = 0
totalSlangsFound = []
totalElongated = 0
totalMultiExclamationMarks = 0
totalMultiQuestionMarks = 0
totalMultiStopMarks = 0
totalAllCaps = 0

# Iterate over each row in df_tweets
for index, row in df_tweets.iterrows():
    totalSentences += 1
    feat = []

    tweet_id = row['tweet_id']
    text     = removeUnicode(row['text'])  # Technique 0

    wordCountBefore = len(re.findall(r'\w+', text))  # word count of one sentence before preprocess

    text = replaceURL(text)  # Technique 1
    text = replaceAtUser(text)  # Technique 1
    text = removeHashtagInFrontOfWord(text)  # Technique 1

    temp_slangs, temp_slangsFound = countSlang(text)
    totalSlangs += temp_slangs  # total slangs for all sentences
    for word in temp_slangsFound:
        totalSlangsFound.append(word)  # all the slangs found in all sentences

    text = replaceSlang(text)  # Technique 2: replaces slang words and abbreviations with their equivalents
    text = replaceContraction(text)  # Technique 3: replaces contractions to their equivalents
    text = removeNumbers(text)  # Technique 4: remove integers from text

    emoticons = countEmoticons(text)  # how many emoticons in this sentence
    totalEmoticons += emoticons

    text = removeEmoticons(text)  # removes emoticons from text

    totalAllCaps += countAllCaps(text)

    totalMultiExclamationMarks += countMultiExclamationMarks(text)  # how many repetitions of exclamation marks in this sentence
    totalMultiQuestionMarks += countMultiQuestionMarks(text)  # how many repetitions of question marks in this sentence
    totalMultiStopMarks += countMultiStopMarks(text)  # how many repetitions of stop marks in this sentence

    text = replaceMultiExclamationMark(text)  # Technique 5: replaces repetitions of exclamation marks with the tag "multiExclamation"
    text = replaceMultiQuestionMark(text)  # Technique 5: replaces repetitions of question marks with the tag "multiQuestion"
    text = replaceMultiStopMark(text)  # Technique 5: replaces repetitions of stop marks with the tag "multiStop"

    totalElongated += countElongated(text)  # how many elongated words emoticons in this sentence

    tokens = tokenize(text, wordCountBefore, tweet_id)  

# View the resulting preprocessed data
print(preprocessed_data.head())
    
print("Total sentences: ",totalSentences,"\n")
print("Total Words before preprocess: ",len(re.findall(r'\w+', f)))
print("Total Distinct Tokens before preprocess: ",len(set(re.findall(r'\w+', f))))
print("Average word/sentence before preprocess: ",len(re.findall(r'\w+', f))/totalSentences,"\n")
print("Total Words after preprocess: ",len(tokens))
print("Total Distinct Tokens after preprocess: ",len(set(tokens)))
print("Average word/sentence after preprocess: ",len(tokens)/totalSentences,"\n")


print("Total run time: ",time() - t0," seconds\n")

print("Total emoticons: ",totalEmoticons,"\n")
print("Total slangs: ",totalSlangs,"\n")
commonSlangs = nltk.FreqDist(totalSlangsFound)
for (word, count) in commonSlangs.most_common(20): # most common slangs across all texts
    print(word,"\t",count)

commonSlangs.plot(20, cumulative=False) # plot most common slangs

print("Total elongated words: ",totalElongated,"\n")
print("Total multi exclamation marks: ",totalMultiExclamationMarks)
print("Total multi question marks: ",totalMultiQuestionMarks)
print("Total multi stop marks: ",totalMultiStopMarks,"\n")
print("Total all capitalized words: ",totalAllCaps,"\n")

#print(tokens)
commonWords = nltk.FreqDist(tokens)
print("Most common words ")
print("Word\tCount")
for (word, count) in commonWords.most_common(100): # most common words across all texts
    print(word,"\t",count)

commonWords.plot(100, cumulative=False) # plot most common words


bgm = nltk.collocations.BigramAssocMeasures()
tgm = nltk.collocations.TrigramAssocMeasures()
bgm_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
tgm_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
bgm_finder.apply_freq_filter(5) # bigrams that occur at least 5 times
print("Most common collocations (bigrams)")
print(bgm_finder.nbest(bgm.pmi, 50)) # top 50 bigram collocations
tgm_finder.apply_freq_filter(5) # trigrams that occur at least 5 times
print("Most common collocations (trigrams)")
print(tgm_finder.nbest(tgm.pmi, 20)) # top 20 trigrams collocations

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kushl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kushl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kushl\AppData\Roaming\nltk_data...


KeyboardInterrupt: 