# Generate random data

## Random numeric data

In [9]:
import numpy as np

# Fixed seed for NumPy's legacy global generator so the draws are repeatable.
SEED = 90
# 50 samples, each with 2 standard-normal features.
SAMPLE_SHAPE = (50, 2)

np.random.seed(SEED)
numeric_data = np.random.randn(*SAMPLE_SHAPE)

## Random text data

In [10]:
import pandas as pd

# Three deliberately messy sentences (mixed case, tabs, newlines, trailing
# spaces, repeated punctuation) used as input by the text examples.
messy_sentences = [
    "This TEXT needs \t\t\tsome cleaning!!!...",
    "This text too!!...       ",
    "Yes, you got it right!\n This one too\n",
]

# Single-column frame; the column name is what the vectorizer cells look up.
text_data = pd.DataFrame({"text_col": messy_sentences})

# Example of KMeans Clustering

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Generate example data: 50 two-dimensional points drawn from a standard normal.
np.random.seed(90)
original_data = np.random.randn(50, 2)

# KMeans model with three clusters. random_state pins the centroid
# initialisation and n_init pins the number of restarts, so the result does
# not depend on the global NumPy RNG state or on the installed scikit-learn
# version's default for n_init.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=90)

# Fit the model and obtain the cluster label of every training point in one
# call (equivalent to fit() followed by predict() on the same data).
cluster_assignment = kmeans.fit_predict(original_data)

# Map clusters to the original data: append the label as a third column.
# column_stack upcasts the integer labels to float to match the coordinates.
mapped_data = np.column_stack((original_data, cluster_assignment))

# Each row of original_data is itself a NumPy array (one 2-D point).
print(type(original_data[0]))


<class 'numpy.ndarray'>


# Example of Text Vectorization using the Bag-of-Words approach

In [11]:
# Bag-of-words vectorization: each document becomes a row of token counts.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow_representation = vectorizer.fit_transform(text_data["text_col"])

# Show, in order: the learned vocabulary, the dense count matrix, and the
# type of the object returned by fit_transform.
for shown in (
    vectorizer.get_feature_names_out(),
    bow_representation.toarray(),
    type(bow_representation),
):
    print(shown)

['cleaning' 'got' 'it' 'needs' 'one' 'right' 'some' 'text' 'this' 'too'
 'yes' 'you']
[[1 0 0 1 0 0 1 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 0 0]
 [0 1 1 0 1 1 0 0 1 1 1 1]]
<class 'scipy.sparse.csr.csr_matrix'>


# Latent Dirichlet Allocation Example

In [12]:
# Latent Dirichlet Allocation example: build the document-term matrix.
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokens are maximal runs of word characters; punctuation and whitespace
# are dropped.
tokenizer = RegexpTokenizer(r'\w+')
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        tokenizer=tokenizer.tokenize,
                        # A custom tokenizer overrides token_pattern; passing
                        # None states that explicitly and avoids
                        # scikit-learn's "token_pattern will not be used"
                        # UserWarning.
                        token_pattern=None)

# NOTE(review): LDA is usually fitted on raw term counts (e.g. the
# CountVectorizer output) rather than TF-IDF weights -- confirm that TF-IDF
# is intended here.
train_data = tfidf.fit_transform(text_data['text_col'])

In [13]:
# Five topics; random_state fixes the model's random initialisation so the
# displayed matrix is the same on every run.
lda = LatentDirichletAllocation(n_components=5, random_state=90)
lda_matrix = lda.fit_transform(train_data)

# Document-topic distribution: one row per document, one column per topic.
# (The topic-word weights are stored separately in lda.components_.)
lda_matrix

array([[0.07365551, 0.07358351, 0.07662975, 0.70247572, 0.07365551],
       [0.10005257, 0.10001601, 0.59893558, 0.10094327, 0.10005257],
       [0.07330893, 0.70689708, 0.07324889, 0.07323617, 0.07330893]])