In [None]:
import pandas as pd

# Read the raw CSV export into a DataFrame. The path is relative, so the file
# must sit in the notebook's working directory.
file_path = 'Raw DataSet.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as the cell's output
data.head(n=5)


# Preprocessing and Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Check for missing values: per-column count of null cells, taken before any
# rows are removed so the cost of the dropna() below stays visible.
missing_values = data.isnull().sum()
rows_before_dropna = len(data)

# Handle missing values (if any). For simplicity, every row containing at
# least one missing value is dropped rather than imputed.
data = data.dropna()
print(f"Dropped {rows_before_dropna - len(data)} of {rows_before_dropna} rows with missing values")

# Encoding categorical variables: one-hot encode the listed columns.
# drop_first=True omits the first level of each column so the resulting
# dummy columns are not perfectly collinear.
categorical_columns = ['gender', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'Contract', 'Churn']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Split the dataset into training and testing sets.
# MonthlyCharges is the regression target; every other column is a feature.
X = data_encoded.drop('MonthlyCharges', axis=1)
y = data_encoded['MonthlyCharges']

# 80/20 split; the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
# NOTE(review): StandardScaler needs numeric input, so this assumes every
# column left in X is numeric -- confirm against the raw CSV.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the first few rows of the scaled training data. The scaler returns
# a bare NumPy array, so the feature names are re-attached for the preview.
pd.DataFrame(X_train_scaled, columns=X.columns).head()


In [None]:
# Output locations for the three CSV artefacts written by this cell
pre_processed_path, training_set_path, testing_set_path = (
    'pre_processed_dataset.csv',
    'training_set.csv',
    'testing_set.csv',
)

# 1) The full encoded dataset (features and target together, unscaled)
data_encoded.to_csv(pre_processed_path, index=False)

# 2) Training split: scaled features with their column names restored, plus
#    the target. The target's index is reset so it lines up row-for-row with
#    the freshly built feature frame.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X.columns)
y_train_aligned = y_train.reset_index(drop=True)
training_set = pd.concat([X_train_scaled_df, y_train_aligned], axis="columns")
training_set.to_csv(training_set_path, index=False)

# 3) Testing split, assembled the same way
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, columns=X.columns)
y_test_aligned = y_test.reset_index(drop=True)
testing_set = pd.concat([X_test_scaled_df, y_test_aligned], axis="columns")
testing_set.to_csv(testing_set_path, index=False)

# Show where the files were written
(pre_processed_path, training_set_path, testing_set_path)


In [None]:
# Row counts for the encoded dataset (i.e. after rows with missing values
# were dropped) and for each side of the train/test split.
total_samples = len(data_encoded)
training_samples = len(X_train)
testing_samples = len(X_test)

(total_samples, training_samples, testing_samples)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

# Compute the correlation matrix: pairwise correlation between every column
# of the encoded dataset (features and the MonthlyCharges target alike).
correlation_matrix = data_encoded.corr()

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap=plt.cm.Reds)
plt.title("Correlation matrix of the encoded dataset")
plt.show()

# Identify highly correlated features.
# The magnitude of the coefficient is compared against the threshold, so
# strongly negative relationships are reported as well as positive ones.
# Only pairs where the first column precedes the second are visited, so each
# unordered pair appears once instead of as both (a, b) and (b, a).
threshold = 0.75
corr_columns = list(correlation_matrix.columns)
high_corr_pairs = [
    (first, second)
    for position, first in enumerate(corr_columns)
    for second in corr_columns[position + 1:]
    if abs(correlation_matrix.loc[first, second]) > threshold
]

# Fit a random forest on the scaled training data purely to rank the
# features; the importances are labelled with the original column names.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
feature_importances = pd.Series(data=rf.feature_importances_, index=X.columns)

# Horizontal bar chart of the importances, sorted in ascending order
plt.figure(figsize=(12, 8))
feature_importances.sort_values(ascending=True).plot.barh()
plt.show()

# Keep only the features whose importance exceeds 0.01. The boolean mask is
# built once and reused for the names and for both scaled arrays, so all
# three selections stay in sync.
is_important = feature_importances > 0.01
selected_features = feature_importances[is_important].index
X_train_selected = X_train_scaled[:, is_important]
X_test_selected = X_test_scaled[:, is_important]

# Expand the selected features with degree-2 interaction terms only
# (interaction_only=True) and without a constant bias column.
# The transformer is fitted on the training split and then applied to both
# splits so they end up with the same output columns.
poly = PolynomialFeatures(include_bias=False, interaction_only=True, degree=2)
poly.fit(X_train_selected)
X_train_poly = poly.transform(X_train_selected)
X_test_poly = poly.transform(X_test_selected)

# Preview the first five rows of the expanded training matrix
pd.DataFrame(data=X_train_poly).head(n=5)
