In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Load the Excel file
file_path = 'test.xlsx'
data = pd.read_excel(file_path)

# Convert non-string entries to strings
data['Text'] = data['Text'].fillna('').astype(str)

# Encode the text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(data['Text'])

# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Type'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the training set
y_train_pred = model.predict(X_train)
print("Training predictions:", y_train_pred)

# Predictions on the test set
y_test_pred = model.predict(X_test)

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Perform K-Means clustering (ignore the target variable)
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X_train)

# Cluster Labels and cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Perform K-Means clustering with explicit n_init parameter
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X_train)

# Get the labels from the clustering
labels = kmeans.labels_

# Convert sparse matrix to dense for metric calculations
X_train_dense = X_train.toarray()

# Calculate clustering evaluation metrics
silhouette = silhouette_score(X_train_dense, labels)
calinski_harabasz = calinski_harabasz_score(X_train_dense, labels)
davies_bouldin = davies_bouldin_score(X_train_dense, labels)

print(f"Silhouette Score: {silhouette}")
print(f"Calinski-Harabasz Score: {calinski_harabasz}")
print(f"Davies-Bouldin Score: {davies_bouldin}")

# Evaluate different values of k
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(X_train)
    silhouette = silhouette_score(X_train, kmeans.labels_)
    print(f"Silhouette Score for k={k}: {silhouette}")


Training predictions: [2 2 0 0 1 2 2 2 1 1 2 2 2 0 1 0 2 2 2 2 2 2 0 2 1 2 2 2 0 0 2 0 2 2 2 0 2
 2 1 1 1 0 0 0 2 2 2 0 2 1 0 0 0 0 2 0 2 1 0 2 0 2 1 0 0 0 1 1 0 2 0 0 0 2
 0 1 1 2 2 2 1 1 1 0 1 1 0 2 2 2 0 0 1 1 0 2 2 1 0 2 0 2 0 0 1 1 2 0 2 2 1
 2 0 2 2 2 2 2 1 1 2 2 0 1 2 0 2 2 0 1 0 0 2 2 2 1 0 0 2 0 0 2 1 2 0 0 1 1
 2 1 2 2 1 2 2 2 0 0 2 0 1 2 0 0 0 1 2 0 0 0 2 2 2 2 2 1 1 0 0 1 0 0 0 1 2
 0 2 2 0 1 2 2 2 0 1 2 2 0 0 2 2 2 2 0 1 1 0 1 1 0 0 1 2 1 2 1 2 2 1 1 2 1
 2 1 2 0 1 2 2 1 1 1 0 2 2 0 0 2 2 0 2 0 1 2 0 1 2 1 0 2 0 2 2 1 0 1 2 1 2
 0 1 0 0 0 0 0 2 2 2 2 2 1 1 2 0 2 0 2 2 2 0 0 1 1 1 2 0 1 1 2 0 1 2 0 2 2
 2 2 0 2 2 2 2 2 2 1 1 1 0 2 2 2 0 0 0 2 2 1 2 2 0 2 1 1 1 0 0 2 2 0 0 0 2
 0 2 1 0 2 1 2 0 1 1 0 1 2 0 2 1 2 0 2 0 2 1 2 2 1 2 2 2 1 0 2 1 2 2 1 2 1
 2 2 0 2 0 2 2 2 2 1 0 0 2 0 2 0 2 2 0 0 0 2 1 0 0 0 1 2 0 0 2 0 1 1 2 2 2
 2 0 2 2 0 1 2 2 1 0 2 2 0 2 0 1 2 2 2 1 0 2 2 2 0 0 0 1 0 1 2 2 2 2 1 2 2
 2 0 2 0 0 2 0 1 2 2 2 2 1 2 1 2 1 0 2 2 2 1 2 0 0 2 2 0 2 1 2 0 2 2 2 1 2
 2 