In [51]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
# Load the processed dataset from disk; the first CSV column becomes the index.
df = pd.read_csv(
    "data/processed/processed_data_30124.csv",
    index_col=0,
)

### 1. Split train and test

In [57]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=123,
)

### 2. KMeans without text data

In [15]:
# Baseline feature matrix: every training column except the tweet text,
# so KMeans can be tried on the non-text features alone.
X_train = train_df.drop(columns="processed_text")

In [16]:
# Fit a 3-cluster KMeans on the non-text features. The cell output is the
# (n_samples, 3) array returned by fit_transform, i.e. X_train mapped into
# cluster-distance space.
# NOTE(review): X_train is passed in unscaled; the very large distances in the
# recorded output suggest a few large-magnitude values dominate the clustering
# -- confirm, and consider standardising as the text pipeline does.
# NOTE(review): n_init is left at the library default, which has changed across
# scikit-learn releases -- confirm against the installed version before
# comparing results.
kmeans = KMeans(
    n_clusters=3,
    random_state=123,
)
kmeans.fit_transform(X_train)

array([[3.69649244e+01, 3.16218521e+05, 1.47357977e+05],
       [3.84078959e+01, 3.16218521e+05, 1.47357978e+05],
       [3.70506401e+01, 3.16219520e+05, 1.47358968e+05],
       ...,
       [3.96613186e+01, 3.16219520e+05, 1.47358969e+05],
       [3.86531063e+01, 3.16219520e+05, 1.47358969e+05],
       [3.75355958e+01, 3.16218521e+05, 1.47357977e+05]])

In [25]:
# Group the training tweets by their assigned cluster label.
# kmeans.labels_ is positionally aligned with the rows the model was fitted on,
# so pairing it with the text column in row order matches tweet to label.
cluster_dict = defaultdict(list)

for label, tweet in zip(kmeans.labels_, train_df["processed_text"]):
    cluster_dict[label].append(tweet)

In [33]:
# Number of tweets that landed in each of the three clusters (ids 0, 1, 2),
# printed space-separated on one line.
print(*[len(cluster_dict[cluster_id]) for cluster_id in (0, 1, 2)])

24095 2 2


### 3. KMeans with text data

In [47]:
# Numeric preprocessing: fill missing values with the column median,
# then standardise each column.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

In [49]:
# Column holding the tweet text that will be vectorised.
text_feature = "processed_text"

# Names of all numeric-dtype columns in the training frame; these go through
# the numeric preprocessing pipeline.
numeric_features = (
    train_df
    .select_dtypes(include=np.number)
    .columns
    .tolist()
)

In [53]:
# Bag-of-words preprocessor: numeric columns go through the numeric pipeline,
# while the text column is turned into unigram + bigram counts with English
# stop words removed.
bow_preprocessor = make_column_transformer(
    (
        numeric_transformer,
        numeric_features,
    ),
    (
        CountVectorizer(ngram_range=(1, 2), stop_words="english"),
        text_feature,
    ),
)
# Trailing semicolon: evaluate without rendering the transformer repr.
bow_preprocessor;

In [68]:
# Fit the preprocessor on the training data and encode it in one step.
train_enc = bow_preprocessor.fit_transform(train_df)

# Keep the fitted n-gram -> column-index mapping of the text vectoriser.
# "countvectorizer" is the name make_column_transformer derives automatically
# from the transformer's class name.
vocab = (
    bow_preprocessor
    .named_transformers_["countvectorizer"]
    .vocabulary_
)

# (n_samples, n_features) of the encoded training matrix.
print(train_enc.shape)

(24099, 189158)


In [78]:
# Fit a 3-cluster KMeans on the encoded (numeric + bag-of-words) training
# matrix. The cell output is the array returned by fit_transform: one row per
# sample, one column per cluster, in cluster-distance space.
# Note: this rebinds `kmeans`, replacing any previously fitted model.
kmeans = KMeans(
    n_clusters=3,
    random_state=123,
)
kmeans.fit_transform(train_enc)

array([[ 5.10937753,  3.47898961, 94.24012244],
       [ 5.12707154,  3.39312831, 94.22918443],
       [ 6.14195072,  5.1984945 , 94.31815579],
       ...,
       [ 6.29044091,  6.38852335, 94.30720576],
       [ 5.60506117,  3.47051687, 94.28091021],
       [ 8.03081624,  8.29043079, 94.40271856]])

In [80]:
# Rebuild the cluster -> tweets mapping from the labels of the current
# `kmeans` model. Labels are positionally aligned with the training rows,
# so they are paired with the text column in row order.
cluster_dict = defaultdict(list)

for label, tweet in zip(kmeans.labels_, train_df["processed_text"]):
    cluster_dict[label].append(tweet)

In [85]:
# Number of tweets assigned to each of the three clusters (ids 0, 1, 2),
# printed space-separated on one line.
print(*[len(cluster_dict[cluster_id]) for cluster_id in (0, 1, 2)])

9768 14327 4


In [87]:
# Show one example tweet per cluster so the groups can be compared by eye.
# The sample position stays at index 2 (the third tweet collected for each
# cluster), matching the previous hard-coded lookups. A cluster may hold fewer
# tweets than that, in which case direct indexing raises IndexError, so such
# clusters get a short placeholder line instead of aborting the cell.
sample_idx = 2

for cluster_id in range(3):
    # .get() avoids inserting an empty list into the mapping for absent ids.
    cluster_tweets = cluster_dict.get(cluster_id, [])
    if len(cluster_tweets) > sample_idx:
        print(f"GROUP_{cluster_id}: ", cluster_tweets[sample_idx])
    else:
        print(
            f"GROUP_{cluster_id}: ",
            f"<only {len(cluster_tweets)} tweet(s) in this cluster>",
        )

GROUP_0:  oh noooo my government be buy sinovac also i hate it it ’ s not a trust and test a the other vaccine and it ’ s more expensive obviously the choice wa political 😡
GROUP_1:  president of serbia aleksandar vučić say that the vaccination with the pfizer vaccine against coronavirus would start tomorrow
GROUP_2:  with covid case surge worse than ever get vaccinate be one of the most important thing we can do but until the vaccine be widely available socially distance and wear mask will actually save even more life and alleviate the pressure on healthcare worker
