In [2]:
import re
import json
import pandas as pd
import numpy as np
from pymongo import MongoClient
from pprint import pprint
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN

In [4]:
# Log into MongoDB
client = MongoClient('localhost')
db = client.twitter_db

# Read all tweets into DataFrame
cursor = db.twitter_posts.find()
df = pd.DataFrame(list(cursor))

# Delete ID column
del df['_id']
df.describe()

  df.describe()


Unnamed: 0,tweet_text,username,created_at
count,303,303,303
unique,303,34,293
top,Make your mark – the new GLE Coupé is just mad...,DollarShaveClub,2020-10-15 18:34:26
freq,1,10,3
first,,,2020-10-08 07:26:26
last,,,2020-10-16 10:16:44


In [5]:
word_lemm = WordNetLemmatizer()

# Tweet preprocessing
def preprocess_texts(text_list: pd.DataFrame):

    # Lowercase the tweets
    text_list['processed_tweet'] = text_list['tweet_text'].str.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z0-9]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    text_list['processed_tweet'] = [re.sub(url_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove username from the tweet text
    text_list['processed_tweet'] = [re.sub(user_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove all non-alphanumeric symbols
    text_list['processed_tweet'] = [re.sub(alpha_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Replace all 3 or more consecutive letters with 2 letters
    text_list['processed_tweet'] = [re.sub(sequence_pattern, seq_replace_pattern, str(x))
                                    for x in text_list['processed_tweet']]

    full_tweet = ''
    full_tweet_list = []
    for x in text_list['processed_tweet']:
        for word in x.split():
            if word not in stopwords.words('english'):
                if len(word) > 1:
                    word = word_lemm.lemmatize(word)
                    full_tweet += (word + ' ')
        full_tweet_list.append(full_tweet)

    text_list['processed_tweet'] = full_tweet_list

    return text_list

In [6]:
import time

df_processed = preprocess_texts(df)

# TF-IDF Vectorization - CountVectorize (Bag of Words), and then apply IF-IDF Transformer
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=800000)
start = time.time()
vectorizer.fit(df_processed['processed_tweet'])
print(f'Vectorizer fitting ended in {round(time.time()-start)} seconds')
print(f'Number of feature_words: {len(vectorizer.get_feature_names())}')

Vectorizer fitting ended in 1 seconds
Number of feature_words: 5245


In [5]:
transformed_tweets = vectorizer.transform(df_processed['processed_tweet'])

kmpp_model = KMeans(n_clusters=10, init='k-means++', n_init=1, max_iter=1000, tol=1e-5,
                    random_state=2200)
kmpp_model.fit(transformed_tweets)

def show_training_results(model, n_clusters):
    order_centroids = model.cluster_centers_.argsort()[:,::-1]
    terms = vectorizer.get_feature_names()

    for i in range(n_clusters):
        print("Cluster %d:" % i)
        for ind in order_centroids[i, :20]:
            print(' %s' % terms[ind])

show_training_results(kmpp_model, 10)

Cluster 0:
 chanel
 dm
 team
 look
 please
 thanks
 louisvuitton
 new
 plastic
 ikea
 collection
 latest
 time
 experience
 like
 learn
 sorry
 help
 year
 show
Cluster 1:
 team
 please
 number
 one
 year
 watch
 longwayup
 hey
 hear
 800
 help
 customer
 call
 change
 climate
 hello
 hi
 check
 happy
 see
Cluster 2:
 chanel
 dm
 team
 ikea
 please
 experience
 help
 sorry
 new
 thanks
 learn
 hear
 year
 look
 gabrielle
 gabrielle chanel
 design
 send
 life
 like
Cluster 3:
 chanel
 dm
 team
 thanks
 max
 new
 louisvuitton
 look
 please
 je
 collection
 plastic
 like
 adobe
 adobe max
 latest
 ikea
 hear
 time
 help
Cluster 4:
 team
 dyson
 year
 disneyplus
 dm
 please
 great
 number
 first
 change
 hi
 real
 streaming
 look
 thanks
 hey
 new
 know
 available
 20
Cluster 5:
 team
 climate
 event
 change
 greenest
 climate change
 framework
 signing
 year
 youth
 activist
 event watch
 activist latest
 youth climate
 live
 latest
 live event
 video
 video event
 raceagainstclimatechang

In [3]:
# Load new dataset - for final model
new_dataset = pd.read_csv('data/trainingandtestdata/training.1600000.processed.noemoticon.csv',  
                          header=None,
                          parse_dates=True,
                          encoding='ISO-8859-1',
                          names=['polarity', 'id', 'date', 'query', 'user', 'text']
                          )
new_dataset.head()

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
word_lemm_new = WordNetLemmatizer()
nltk_stopwords = stopwords.words('english')

# Tweet preprocessing
def preprocess_texts_new(text_list: list):

    # Lowercase the tweets
    tweets = text_list.str.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z0-9]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    tweets = [re.sub(url_pattern, ' ', str(x))
                                    for x in tweets]
    # Remove username from the tweet text
    tweets = [re.sub(user_pattern, ' ', str(x))
                                    for x in tweets]
    # Remove all non-alphanumeric symbols
    tweets = [re.sub(alpha_pattern, ' ', str(x))
                                    for x in tweets]
    # Replace all 3 or more consecutive letters with 2 letters
    tweets = [re.sub(sequence_pattern, seq_replace_pattern, str(x))
                                    for x in tweets]

    full_tweet = ''
    full_tweet_list = []
    for x in tweets:
        for word in x.split():
            if word not in nltk_stopwords:
                if len(word) > 1:
                    word = word_lemm_new.lemmatize(word)
                    full_tweet += (word + ' ')
        full_tweet_list.append(full_tweet)

    return full_text_list

In [None]:
import time

processed_texts = preprocess_texts_new(new_dataset['text'])

# TF-IDF Vectorization - CountVectorize (Bag of Words), and then apply IF-IDF Transformer
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
start = time.time()
vectorizer.fit(processed_texts)
print(f'Vectorizer fitting ended in {round(time.time()-start)} seconds')
print(f'Number of feature_words: {len(vectorizer.get_feature_names())}')

In [None]:
transformed_tweets = vectorizer.transform(df_processed['processed_tweet'])

kmpp_model = KMeans(n_clusters=10, init='k-means++', n_init=1, max_iter=1000, tol=1e-5,
                    random_state=2200)
kmpp_model.fit(transformed_tweets)

def show_training_results(model, n_clusters):
    order_centroids = model.cluster_centers_.argsort()[:,::-1]
    terms = vectorizer.get_feature_names()

    for i in range(n_clusters):
        print("Cluster %d:" % i)
        for ind in order_centroids[i, :20]:
            print(' %s' % terms[ind])

show_training_results(kmpp_model, 10)