In [None]:
import pandas as pd

# Location of the raw CSV export
file_path = 'Raw DataSet.csv'

# Parse the CSV into a DataFrame
data = pd.read_csv(file_path)

# Peek at the leading rows (head() defaults to 5)
data.head(5)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Per-column count of missing entries (kept for inspection)
missing_values = data.isna().sum()

# Simplest missing-value strategy: keep only fully populated rows
data = data.dropna()

# One-hot encode the categorical columns; the first level of each is dropped
categorical_columns = [
    'gender', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'Contract', 'Churn',
]
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# MonthlyCharges is the target; every remaining column is a feature
y = data_encoded['MonthlyCharges']
X = data_encoded.drop(columns='MonthlyCharges')

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the scaled training matrix
pd.DataFrame(X_train_scaled).head()


In [None]:
# File names for the three CSV exports
pre_processed_path, training_set_path, testing_set_path = (
    'pre_processed_dataset.csv',
    'training_set.csv',
    'testing_set.csv',
)

# Export the encoded (unscaled) dataset
data_encoded.to_csv(pre_processed_path, index=False)

# Training export: scaled features with the target appended as the last column.
# The target's index is reset so it lines up with the 0..n-1 index of the scaled frame.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
train_target = y_train.reset_index(drop=True)
training_set = pd.concat([X_train_scaled_df, train_target], axis=1)
training_set.to_csv(training_set_path, index=False)

# Testing export: same layout as the training export
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)
test_target = y_test.reset_index(drop=True)
testing_set = pd.concat([X_test_scaled_df, test_target], axis=1)
testing_set.to_csv(testing_set_path, index=False)

# Echo the written file names
pre_processed_path, training_set_path, testing_set_path


('pre_processed_dataset.csv', 'training_set.csv', 'testing_set.csv')

In [None]:
# Row counts of the encoded dataset and of the train / test feature splits
total_samples, training_samples, testing_samples = (
    frame.shape[0] for frame in (data_encoded, X_train, X_test)
)

# Echo the three counts
total_samples, training_samples, testing_samples


(7043, 5634, 1409)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

# Feature analysis: correlation overview, RandomForest-based feature
# selection, then pairwise interaction features for the selected columns.

# Correlation matrix over every encoded column (features and target alike)
correlation_matrix = data_encoded.corr()

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap=plt.cm.Reds)
plt.show()

# Identify highly correlated column pairs.
# The absolute value is used so strong negative correlations are caught too,
# and only the upper triangle is scanned so each pair is listed once
# instead of as both (a, b) and (b, a).
threshold = 0.75
corr_columns = list(correlation_matrix.columns)
high_corr_pairs = [
    (first, second)
    for position, first in enumerate(corr_columns)
    for second in corr_columns[position + 1:]
    if abs(correlation_matrix.loc[first, second]) > threshold
]

# Train a RandomForest model to get feature importances
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)

# Plot feature importances, smallest at the bottom
plt.figure(figsize=(12, 8))
feature_importances.sort_values().plot(kind='barh')
plt.show()

# Keep features whose importance exceeds 0.01.
# The mask is computed once, as a plain boolean numpy array, and reused for
# the column index and for both scaled matrices so the three stay in sync.
importance_mask = (feature_importances > 0.01).to_numpy()
selected_features = feature_importances.index[importance_mask]
X_train_selected = X_train_scaled[:, importance_mask]
X_test_selected = X_test_scaled[:, importance_mask]

# Generate degree-2 interaction terms (no squared terms, no bias column);
# the transformer is fitted on the training matrix and reused for the test matrix
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_selected)
X_test_poly = poly.transform(X_test_selected)

# Display the first few rows of the generated polynomial features
pd.DataFrame(X_train_poly).head()


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-Means for 1..max_clusters clusters and record the
# within-cluster sum of squares (inertia) of each fit
max_clusters = 10
cluster_counts = range(1, max_clusters + 1)
wcss = []

for n_clusters in cluster_counts:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=42,
    ).fit(X_train_poly)
    wcss.append(kmeans.inertia_)

# Draw WCSS against the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_counts, wcss, marker='o', linestyle='--')
ax.set_title('Elbow Method for Optimal Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Fit K-Means with 3 clusters on the polynomial training features and keep
# the label assigned to each training row
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_train_poly)

# Polynomial features as a frame (integer column labels) plus a 'Cluster' column
X_train_clustered = pd.DataFrame(X_train_poly).assign(Cluster=clusters)

# Scatter of the first two polynomial-feature columns, coloured by cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=X_train_clustered, x=0, y=1, hue='Cluster', palette='viridis', ax=ax)
ax.set_title('Clusters Visualization')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.legend(title='Cluster')
plt.show()

#import ace_tools as tools; tools.display_dataframe_to_user(name="Clustered Dataset", dataframe=X_train_clustered)


In [None]:
from sklearn.decomposition import PCA

# Project the polynomial training features onto two principal components so
# the cluster assignment can be inspected in a 2-D scatter plot.
# random_state is fixed (42, as everywhere else in this notebook) so the
# projection is reproducible if scikit-learn picks a randomized SVD solver;
# it has no effect when a deterministic solver is used.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train_poly)

# Add the cluster labels to the PCA-transformed data
X_train_pca_df = pd.DataFrame(X_train_pca, columns=['PCA1', 'PCA2'])
X_train_pca_df['Cluster'] = clusters

# Visualize the clusters in the PCA-transformed space
plt.figure(figsize=(10, 6))
sns.scatterplot(data=X_train_pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='viridis')
plt.title('Clusters Visualization with PCA')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend(title='Cluster')
plt.show()


In [None]:
import numpy as np

# Tabulate the K-Means centroids: one row per cluster, one column per column
# of the matrix the model was fitted on.
centroids = kmeans.cluster_centers_

# The centroids live in the space K-Means was fitted on (X_train_poly: scaled,
# importance-selected, interaction-expanded features), not in the raw columns.
# The generic Feature1..FeatureN headers are therefore positional only.
centroids_df = pd.DataFrame(centroids, columns=[f'Feature{i+1}' for i in range(centroids.shape[1])])

# Label each row with its cluster id. The ids are derived from the number of
# centroids instead of a hard-coded [0, 1, 2], so this keeps working if the
# model is refitted with a different n_clusters.
centroids_df['Cluster'] = list(range(centroids.shape[0]))

centroids_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plots of every original (unscaled) training feature, split by cluster.

# Work on an explicit copy of the training features. Wrapping an existing
# DataFrame in pd.DataFrame(...) does not copy by default, and on some pandas
# versions the new 'Cluster' column could then leak into X_train itself.
X_train_original_df = X_train.copy()
X_train_original_df['Cluster'] = clusters  # positional: one label per training row

# One figure per feature column (everything except the cluster label)
for feature in X_train_original_df.columns.drop('Cluster'):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster', y=feature, data=X_train_original_df, palette='viridis', ax=ax)
    ax.set_title(f'Distribution of {feature} by Cluster')
    plt.show()
    plt.close(fig)  # release the figure; this loop creates one per feature


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise scatter grid of the original features, coloured by cluster,
# with kernel-density curves on the diagonal
pairplot_options = {'hue': 'Cluster', 'palette': 'viridis', 'diag_kind': 'kde'}
sns.pairplot(X_train_original_df, **pairplot_options)

# Title the whole grid, nudged just above the top row of panels
plt.suptitle('Pair Plot of Key Features by Cluster', y=1.02)
plt.show()


In [None]:
# Stacked histograms (with KDE) of the key features, split by cluster.
key_features = ['tenure', 'MonthlyCharges']

# 'MonthlyCharges' is the target: it was removed from X before the split, so
# it is not a column of X_train_original_df. Join it back from y_train (same
# row index as the training features) so it can actually be plotted instead
# of always falling through to the warning.
key_features_df = X_train_original_df
if y_train.name not in key_features_df.columns:
    key_features_df = key_features_df.join(y_train)

for feature in key_features:
    # Check first, so that a missing feature does not leave an empty figure behind
    if feature not in key_features_df.columns:
        print(f"Warning: Feature '{feature}' not found in the DataFrame.")
        continue

    plt.figure(figsize=(10, 6))
    sns.histplot(data=key_features_df, x=feature, hue='Cluster', multiple='stack', palette='viridis', kde=True)
    plt.title(f'Distribution of {feature} by Cluster')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Correlation heatmaps: one over the whole training frame, then one per cluster.

# Whole training frame (the 'Cluster' label column is included here)
plt.figure(figsize=(12, 8))
sns.heatmap(X_train_original_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Heatmap of Feature Correlations')
plt.show()

# Per-cluster heatmaps, in ascending cluster order.
# 'Cluster' is dropped before correlating: it is constant inside a single
# cluster, so its correlations are undefined and would only add an all-NaN
# row and column to every per-cluster heatmap.
for cluster in sorted(X_train_original_df['Cluster'].unique()):
    in_cluster = X_train_original_df['Cluster'] == cluster
    cluster_features = X_train_original_df.loc[in_cluster].drop(columns='Cluster')

    plt.figure(figsize=(12, 8))
    sns.heatmap(cluster_features.corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Heatmap of Feature Correlations for Cluster {cluster}')
    plt.show()
