## **Preparing the work environment**

In [None]:
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,confusion_matrix,classification_report
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from gensim.models import Word2Vec

import nltk
# Download the NLTK resources used below: the stop-word corpus and the 'punkt' tokenizer data.
# NOTE(review): newer NLTK releases may also need the 'punkt_tab' resource for word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# English stop words kept as a set, plus one shared Porter stemmer instance.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

## load dataset

In [None]:
# Directory that holds both CSV files; edit this single constant when the data lives elsewhere.
DATA_DIR = r'/content/drive/MyDrive/Colab Notebooks'
train = pd.read_csv(f'{DATA_DIR}/train1.csv')
test = pd.read_csv(f'{DATA_DIR}/test1.csv')

## EDA

In [None]:
# Report the shape of each loaded frame.
for label, frame in (('Shape of train ', train), ('Shape of test ', test)):
    print(label, frame.shape)

In [None]:
# Column names, dtypes and non-null counts of the raw training frame.
train.info()

In [None]:
# Column names, dtypes and non-null counts of the raw test frame.
test.info()

In [None]:
# Bar chart of how many training rows fall into each target value.
sns.countplot(data=train, x='target')

In [None]:
# Keep the labels and row identifiers before the metadata columns are removed.
target = train['target']
if 'id' in train.columns:  # already dropped when this cell is re-run; keep the saved ids
    train_id = train['id']
    test_id = test['id']

# Non-inplace drop with errors='ignore' so re-running the cell does not raise KeyError.
metadata_cols = ['id', 'date', 'user', 'flag']
train = train.drop(columns=metadata_cols, errors='ignore')
test = test.drop(columns=metadata_cols, errors='ignore')

In [None]:
# Re-inspect the training frame after the drop in the previous cell.
train.info()

In [None]:
# Re-inspect the test frame after the drop in the previous cell.
test.info()

## Preprocessing
Extract the text column and clean it with the following steps:

1- Remove punctuation: Punctuation marks such as commas, periods, and question marks do not provide much meaning for sentiment analysis, so you can remove them from the text.

2- Convert text to lowercase: Convert all the text to lowercase so that similar words can be grouped together.

3- Tokenize the text: Tokenization is the process of splitting a sentence into words. You can use NLTK library to tokenize the text.

4- Remove stop words: Stop words are common words such as "the", "and", and "in" that do not add much meaning to the text. You can remove them using NLTK's stop words list.

5- Stemming/Lemmatization: Stemming and lemmatization are techniques to reduce the word to its root form. It can help in reducing the feature space. You can use NLTK's WordNetLemmatizer for lemmatization or PorterStemmer for stemming.

In [None]:
def preprocess_text(text):
  """Normalise one raw text value into a space-joined string of stemmed tokens.

  Steps: replace every non-letter character with a space, lowercase,
  tokenize with ``word_tokenize``, drop tokens found in ``stop_words``
  (and any non-alphabetic token), then stem each remaining token with
  ``stemmer``.

  Args:
    text: The raw text. Non-string values (e.g. ``None`` or a float NaN
      from a missing cell) are treated as empty text.

  Returns:
    The cleaned text as a single string; ``""`` when nothing remains.
  """
  # Missing cells arrive as None/NaN, which re.sub would reject with TypeError.
  if not isinstance(text, str):
    return ""
  text = re.sub(r'[^a-zA-Z]', ' ', text).strip()
  text = text.lower()
  tokens = word_tokenize(text)
  filtered_tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
  stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
  return " ".join(stemmed_tokens)

In [None]:
# Clean the text column of both frames; the function is passed directly instead of via a lambda.
train['text'] = train['text'].apply(preprocess_text)
test['text'] = test['text'].apply(preprocess_text)

### Split the DataFrame and transform the text into TF-IDF vectors

In [None]:
# Hold out 10% of the training rows for validation (fixed seed).
X_train, X_val = train_test_split(
    train,
    test_size=.1,
    random_state=2,
)

In [None]:
# Word-level TF-IDF over 1- to 3-grams with sublinear term-frequency scaling.
cv = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Fit on the training split only, then reuse the fitted vectorizer for validation and test.
df_train = cv.fit_transform(X_train['text'])
df_val, df_test = (cv.transform(part['text']) for part in (X_val, test))

In [None]:
# Shapes of the vectorized training and validation matrices.
print('df_train shape ',df_train.shape)
# The label previously said 'df_train' although the value printed is df_val's shape.
print('df_val shape ',df_val.shape)

In [None]:
# Two-cluster KMeans with random centroid initialisation, restarted n_init times.
# random_state pins those random initialisations so a Restart & Run All reproduces the same clusters.
model = KMeans(n_clusters=2, init='random', n_init=1000, random_state=2)

In [None]:
# Fit the KMeans model on the TF-IDF training matrix.
model.fit(df_train)

In [None]:
# Display the repr of the vectorized validation matrix.
df_val

In [None]:
# Assign each validation row to a KMeans cluster and cross-tabulate clusters against the true targets.
# NOTE(review): KMeans cluster indices are arbitrary, so they may not line up with the values
# stored in `target` — confirm the mapping before reading this as a classification result.
pred = model.predict(df_val)
conf_mat = confusion_matrix(X_val.target, pred)
print(conf_mat)

In [None]:
# Heatmap of the confusion matrix, each cell shown as a share of all validation rows.
conf_share = conf_mat / np.sum(conf_mat)
ax = sns.heatmap(conf_share, annot=True, fmt='.2%', cmap='Blues')

ax.set(
    title='Seaborn Confusion Matrix with labels\n\n',
    xlabel='\nPredicted Values',
    ylabel='Actual Values ',
)

# Tick labels - list must be in alphabetical order
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['False','True'])

# Display the visualization of the confusion matrix.
plt.show()

In [None]:
# Per-class precision/recall/F1 of the cluster assignments in `pred` against the validation targets.
print(classification_report(X_val.target, pred))

In [None]:
# Cluster assignment for every row of the vectorized test matrix.
predictions = model.predict(df_test)

In [None]:
# Pair each test id with its predicted cluster and write the submission file.
submission_columns = {'Id': test_id, 'target': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('kmeans2MMM.csv', index=False)

print("Your submission was successfully saved!")

The message "PCA does not support sparse input. See TruncatedSVD for a possible alternative" means that the PCA algorithm in scikit-learn does not accept sparse input data.

Sparse data refers to data that has a large number of zero values. In natural language processing, for example, sparse data is common when working with text data that has been converted to a bag-of-words representation.

TruncatedSVD is an alternative algorithm that can be used for dimensionality reduction on sparse data. TruncatedSVD is similar to PCA, but specifically designed to handle sparse matrices as input. The TruncatedSVD algorithm is a variant of singular value decomposition (SVD) that can be used to reduce the dimensionality of sparse data.

In [None]:
# Demonstrates the sparse-input limitation described above. The TypeError is caught and
# printed so the cell documents the failure without stopping a Restart & Run All.
try:
    pca = PCA(n_components=100)
    pca.fit(df_train)
    X_train_reduce = pca.transform(df_train)
    X_val_reduce = pca.transform(df_val)
except TypeError as err:
    print(err)

## Reduce the dimensionality using TruncatedSVD

In [None]:
# Learn the SVD projection on the training matrix only, then apply that same projection
# to the validation and test matrices. Re-fitting on each split (fit_transform) would give
# every split its own components, so a model fitted on the training projection could not
# be applied meaningfully to the others.
svd = TruncatedSVD(n_components=1000, random_state=2)  # seeded for reproducible components
X_train_reduce = svd.fit_transform(df_train)
X_val_reduce = svd.transform(df_val)
test_reduce = svd.transform(df_test)

In [None]:
# Re-fit the same KMeans object on the SVD-reduced training features (replaces the earlier TF-IDF fit).
model.fit(X_train_reduce)

In [None]:
# Cluster the reduced validation features and cross-tabulate against the true targets.
# NOTE(review): cluster indices are arbitrary — confirm they line up with the target values.
pred = model.predict(X_val_reduce)
conf_mat = confusion_matrix(X_val.target, pred)
print(conf_mat)

In [None]:
# Confusion matrix as a heatmap, normalised by the total number of validation rows.
normalised = conf_mat / np.sum(conf_mat)
ax = sns.heatmap(normalised, annot=True, fmt='.2%', cmap='Blues')

ax.set(
    title='Seaborn Confusion Matrix with labels\n\n',
    xlabel='\nPredicted Values',
    ylabel='Actual Values ',
)

# Same two tick labels on both axes.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['False','True'])

# Display the visualization of the confusion matrix.
plt.show()

In [None]:
# Per-class precision/recall/F1 of `pred` against the validation targets.
print(classification_report(X_val.target, pred))

In [None]:
# Cluster the reduced test features, then write ids and assignments to the submission file.
predictions = model.predict(test_reduce)
submission_columns = {'Id': test_id, 'target': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('kmeans2.csv', index=False)

print("Your submission was successfully saved!")

## Hierarchical Clustering

Using the reduced data

In [None]:
# Two-cluster agglomerative (hierarchical) clustering with Ward linkage, fitted on the reduced training features.
model2 = AgglomerativeClustering(n_clusters=2, linkage='ward')
model2.fit(X_train_reduce)

In [None]:
# Evaluate the hierarchical model. The cell previously called `model` (KMeans), so it
# repeated the KMeans result. AgglomerativeClustering has no predict(), so the validation
# rows are clustered directly with fit_predict (this re-fits model2 on the validation set).
# NOTE(review): cluster indices are arbitrary — confirm they line up with the target values.
pred = model2.fit_predict(X_val_reduce)
conf_mat = confusion_matrix(X_val.target, pred)
print(conf_mat)

In [None]:
# Draw the confusion matrix with each cell expressed as a fraction of all validation rows.
cell_fractions = conf_mat / np.sum(conf_mat)
ax = sns.heatmap(cell_fractions, annot=True, fmt='.2%', cmap='Blues')

ax.set(
    title='Seaborn Confusion Matrix with labels\n\n',
    xlabel='\nPredicted Values',
    ylabel='Actual Values ',
)

## Tick labels - list must be in alphabetical order
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['False','True'])

## Display the visualization of the confusion matrix.
plt.show()

In [None]:
# Per-class precision/recall/F1 of `pred` against the validation targets.
print(classification_report(X_val.target, pred))

In [None]:
# Use the hierarchical model for this section's submission. The cell previously called
# `model` (KMeans), so the file duplicated the KMeans predictions. AgglomerativeClustering
# has no predict(), so the reduced test rows are clustered directly with fit_predict.
predictions = model2.fit_predict(test_reduce)
output = pd.DataFrame({'Id':test_id,
                       'target': predictions})
output.to_csv('kmeans3.csv', index=False)

print("Your submission was successfully saved!")

Using the data without dimensionality reduction

In [None]:
# Re-fit the hierarchical model on the full TF-IDF training matrix, converted to a dense array.
# NOTE(review): .toarray() materialises the whole matrix densely — check that it fits in memory for this dataset.
model2.fit(df_train.toarray())

In [None]:
# Evaluate the hierarchical model on the unreduced features. The cell previously called
# `model` (KMeans), which was last fitted on the SVD-reduced features and so cannot score
# the full TF-IDF matrix. AgglomerativeClustering has no predict(), so the validation rows
# are clustered directly with fit_predict on a dense copy.
# NOTE(review): cluster indices are arbitrary — confirm they line up with the target values.
pred = model2.fit_predict(df_val.toarray())
conf_mat = confusion_matrix(X_val.target, pred)
print(conf_mat)

In [None]:
# Plot the confusion matrix as percentages of the total validation count.
as_share = conf_mat / np.sum(conf_mat)
ax = sns.heatmap(as_share, annot=True, fmt='.2%', cmap='Blues')

ax.set(
    title='Seaborn Confusion Matrix with labels\n\n',
    xlabel='\nPredicted Values',
    ylabel='Actual Values ',
)

## Tick labels - list must be in alphabetical order
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['False','True'])

## Display the visualization of the confusion matrix.
plt.show()

In [None]:
# Per-class precision/recall/F1 of `pred` against the validation targets.
print(classification_report(X_val.target, pred))

In [None]:
# Use the hierarchical model on the unreduced test features. The cell previously called
# `model` (KMeans), which was last fitted on the SVD-reduced features and so cannot score
# the full TF-IDF matrix. AgglomerativeClustering has no predict(), so the test rows are
# clustered directly with fit_predict on a dense copy.
predictions = model2.fit_predict(df_test.toarray())
output = pd.DataFrame({'Id':test_id,
                      'target': predictions})
output.to_csv('kmeans4.csv', index=False)

print("Your submission was successfully saved!")