In [1]:
import numpy as np
import pandas as pd
import json

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [23]:
path = "tweets/tweets/5g-corona/5g_corona_conspiracy.json"
corona_data = json.load(open(path))
path = "tweets/tweets/non-conspiracy/non_conspiracy.json"
non_data = json.load(open(path))
path = "tweets/tweets/Other-Conspiracy/other_conspiracy.json"
other_data = json.load(open(path))

In [24]:
corona_texts = np.array([corona_data[i]['full_text'] for i in range(len(corona_data))])
non_texts = np.array([non_data[i]['full_text'] for i in range(len(non_data))])
other_texts = np.array([other_data[i]['full_text'] for i in range(len(other_data))])

In [106]:
count = 0
for text in other_texts:
    if 'http' in text:
        count += 1
print(count/len(other_texts))

0.4957142857142857


In [25]:
corona_df = pd.DataFrame(corona_texts, columns= [ 'body' ])
non_df = pd.DataFrame(non_texts, columns= [ 'body' ])
other_df = pd.DataFrame(other_texts, columns= [ 'body' ])

In [7]:
df.head(5)

Unnamed: 0,body
0,This is mad 5g poles just popping up everywher...
1,Former Vodafone Boss Blows Whistle on 5G Coron...
2,Okay so if the 5G theory isn’t true or have so...
3,@DB1_2 @JayMcCluskey74 @NotArsedLike2 I can't ...
4,Conspiracy theories online suggest #5G #techno...


In [81]:
def preprocess(df):
    df_copy = df.copy()
    df_copy["body"] = df_copy["body"].str.lower()
    df_copy["body"] = df_copy.apply(lambda row: word_tokenize(row['body']), axis = 1)
    df_copy["body"] = df_copy.apply(lambda row: [word for word in row["body"] if (word.isalpha() or any(map(str.isdigit, word)))], axis = 1)
    return df_copy

In [60]:
def lemmatize(df):
    df_copy = df.copy()
    lemmatizer = WordNetLemmatizer()
    df_copy["body"] = df_copy.apply(lambda row: [lemmatizer.lemmatize(word) for word in row["body"]], axis = 1)
    return df_copy

https://www.machinelearningplus.com/nlp/lemmatization-examples-python/?fbclid=IwAR2qZjnIsSyY_3OGBPg_lovC9o0uKWM8dGq8Ne22bMVHxG8E7OE8SPAyNxE

In [10]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [11]:
def lemmatize_pos(df):
    df_copy = df.copy()
    lemmatizer = WordNetLemmatizer()
    df_copy["content"] = df_copy.apply(lambda row: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in row["content"]], axis = 1)
    return df_copy

In [12]:
def tfidf_vector(df):
    df_copy = df.copy()
    tfidf = TfidfVectorizer(analyzer='word', tokenizer=lambda _: _, preprocessor=lambda _: _, token_pattern=None, stop_words='english')
    tvec_weights = tfidf.fit_transform(df_copy["body"])
    # print(tvec_weights.toarray())
    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tfidf.get_feature_names(), 'weight': weights})
    weights_sorted = weights_df.sort_values(by = 'weight', ascending = False)
    print(weights_sorted.head(10))
    return tvec_weights
    

In [82]:
corona_content = corona_df.pipe(preprocess).pipe(lemmatize)
non_content = non_df.pipe(preprocess).pipe(lemmatize)
other_content = other_df.pipe(preprocess).pipe(lemmatize)

In [None]:
corona_content

In [62]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))


In [84]:
from nltk.probability import FreqDist
fdist = FreqDist()
for tweets_token in corona_content['body'].to_numpy():
    for word in tweets_token:
        if word not in stop_words:
            fdist[word.lower()]+= 1

In [87]:
corona_buzz_words = ['coronavirus', 'virus', 'wuhan', 'corona', 'pandemic', 'covid', 'symptom', 'vaccine', 'flu', 'covid19', 'covid-19']
g5_buzz_words = ['network', 'radiation', '5g', '4g', 'tower']

In [94]:
count = 0
thres = 2
data = non_content['body'].to_numpy()
for x in data:
    g5_count = 0
    corona_count = 0
    for word in x:
        if word in corona_buzz_words:
            corona_count += 1
        if word in g5_buzz_words:
            g5_count += 1
        if corona_count >= thres and g5_count >= thres:
            count += 1
            break
print(count/len(data))

0.06398274622573688


In [17]:
np.mean([len(x) for x in df_content['body'].to_numpy()])

32.79823008849557

In [18]:
vectors = df_content.pipe(tfidf_vector).toarray()

             term    weight
2022         http  0.042232
892   coronavirus  0.037450
4526        virus  0.026956
3099       people  0.024960
4549           wa  0.022282
4704        wuhan  0.021317
886        corona  0.021009
4288        tower  0.019724
3632            s  0.019110
169           amp  0.018913


  'stop_words.' % sorted(inconsistent))


In [22]:
len(vectors[1])

4758

In [17]:
import pyclustertend as pct
pct.hopkins(vectors, )

0.09590129288870966

In [51]:
from sklearn import metrics
from sklearn.cluster import KMeans

summary = []
for i in range(50, 100):
    labels = KMeans(n_clusters=i, random_state=None).fit_predict(vectors)
    result = metrics.silhouette_score(vectors, labels)
    summary.append(result)
    print(i)


50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [52]:
l = np.asarray(summary)
print(l.argmax())

11


In [53]:
summary

[0.035689733224199875,
 0.03016618150274908,
 0.03231839073518291,
 0.0342456455092172,
 0.01085831572895175,
 0.033461297892968915,
 -0.0010061637525390532,
 0.026105963608349783,
 0.023765732232693796,
 -0.014092954526579191,
 -0.010043854324797315,
 0.03716178956304722,
 0.022845862199821085,
 0.03176113781413042,
 0.027968472033004333,
 0.01983264412247695,
 0.035113104771028344,
 0.036027582821125975,
 0.033409800108821004,
 -0.016312964235137055,
 -0.008644114852455654,
 -0.01497617070300697,
 0.025027930951929213,
 -0.012033584717626732,
 -0.013634612663815814,
 -0.015795155536321343,
 -0.012964892707546414,
 -0.011767787735382437,
 -0.0056388472959391905,
 -0.017386388310902147,
 -0.008033269639290049,
 -0.021976473341743418,
 -0.013607819864435494,
 -0.01635261525803282,
 -0.015432414623469443,
 -0.0103653122350058,
 0.036551525132938835,
 0.029752526331921103,
 0.02768280735452487,
 0.007764548033019912,
 -0.014112950025018054,
 -0.00827948204194722,
 -0.009893352698343457,
 