# NLP learning for bot (conservative twitter bot)

In [14]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import pickle
import random
import seaborn as cb
import json
import numpy as np
import nltk 
from mosestokenizer import MosesDetokenizer 
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import re
import matplotlib.pyplot as plt
import itertools
import collections

## Data scrapping

In [2]:
tweets_con = []

#scrape data and append tweets to list
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#conservative').get_items()): # declare a username
  if i>50000: #number of tweets you want to scrape
    break
  tweets_con.append([tweet.content]) # declare the attributes to be returned

# creating a dataframe from the list
tweets_con_df = pd.DataFrame(tweets_con, columns=['Text'])

  tweets_con.append([tweet.content]) # declare the attributes to be returned
Tweet 1575223201269006336 contains an app icon medium key '4_1594606217359462401' on app 'iphone_app'/'1142951331', but the corresponding medium is missing; dropping
Tweet 1575223201269006336 contains an app icon medium key '4_1594606217359462401' on app 'ipad_app'/'1142951331', but the corresponding medium is missing; dropping


### Creating json of tweets for preservation

In [3]:
file = open("tweet_con.json",'w')
j = json.dumps(tweets_con, default = str)
file.write(j)
file.close()

### Clean up

In [4]:
def clean_text(txt):
    cleaned = re.sub(r"http\S+","", txt)
    return cleaned

In [5]:
cleaned_text = [clean_text(elements) for elements in tweets_con_df.Text]
tt = TweetTokenizer()
normalize = [tt.tokenize(elements.lower()) for elements in cleaned_text]

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\djona\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
#stop_words = set(stopwords.words('english'))
#stop_words.add('#conservative')

# remove stop words and punctuation
#complete = [[w for w in words if not w in stop_words if w.isalnum()] for words in normalize]
complete = [[w for w in words if w.isalnum()] for words in normalize]

### Ngram splitting

In [31]:
trigram = [list(ngrams(words,3)) for words in complete]
bigram = [list(ngrams(words,2)) for words in complete]
quadgram = [list(ngrams(words,4)) for words in complete]
trigramC = list(itertools.chain(*trigram))
bigramC = list(itertools.chain(*bigram))
quadgramC = list(itertools.chain(*quadgram))
tokenC = list(itertools.chain(*complete))
token_counter = collections.Counter(tokenC)
trigram_counter = collections.Counter(trigramC)
bigram_counter = collections.Counter(bigramC)
quadgram_counter = collections.Counter(quadgramC)
trigram_total = sum(trigram_counter.values())
bigram_total = sum(bigram_counter.values())
quadgram_total = sum(quadgram_counter.values())

In [None]:
bigram_rel_freq = {k: (v / bigram_total)+1 for (k,v) in bigram_counter.items()}
with open("models/bigram_tweet.pkl","wb") as outfile:
    pickle.dump(bigram_rel_freq, outfile)

In [33]:
trigram_rel_freq = {k: (v / trigram_total)+1 for (k,v) in trigram_counter.items()}
with open("models/trigram_tweet.pkl","wb") as outfile:
    pickle.dump(trigram_rel_freq, outfile)

In [None]:
quadgram_rel_freq = {k: (v / quadgram_total)+1 for (k,v) in quadgram_counter.items()}
with open("models/quadgram_tweet.pkl","wb") as outfile:
    pickle.dump(quadgram_rel_freq, outfile)

### Generator based on model

In [35]:
def ngram_gen(seed, num):
    if(num == 2):
        with open("models/bigram_tweet.pkl", "rb") as infile:
            bigrams = pickle.load(infile)
        bgms = {k: v for (k,v) in bigrams.items() if k[:2] == seed}
        wds = [e[2] for e in bgms.keys()]
        if wds:
            weights = [float(e) for e in bgms.values()]
            return random.choices(population=wds,weights=weights)[0]
        else:
            w = random.choice(list(token_counter.keys()))
            return w
    if(num == 3):
        with open("models/trigram_tweet.pkl", "rb") as infile:
            trigrams = pickle.load(infile)
        tgms = {k: v for (k,v) in trigrams.items() if k[:2] == seed}
        wds = [e[2] for e in tgms.keys()]
        if wds:
            weights = [float(e) for e in tgms.values()]
            return random.choices(population=wds,weights=weights)[0]
        else:
            w = random.choice(list(token_counter.keys()))
            return w
    if(num == 4):
        with open("models/quadgram_tweet.pkl", "rb") as infile:
            quadgrams = pickle.load(infile)
        qgms = {k: v for (k,v) in quadgrams.items() if k[:2] == seed}
        wds = [e[2] for e in qgms.keys()]
        if wds:
            weights = [float(e) for e in qgms.values()]
            return random.choices(population=wds,weights=weights)[0]
        else:
            w = random.choice(list(token_counter.keys()))
            return w


In [36]:
line = list(random.choice(list(trigram_rel_freq.keys())))
for i in range(12):
    w = ngram_gen((line[-2], line[-1]),3)
    line.append(w)
