# Import everything

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
import tensorflow as tf

## Data Normalization

In [28]:
df = pd.read_csv('finaltrygood.csv', low_memory=False)

Get only the relevant columns

In [29]:
relevant_columns = [
    'Draw1', 'Draw2', 'Draw3', 'Max', 'longitude', 'latitude', 'township',
    'nbhd', 'tract_pop', 'tract_white_perc', 'ward', 'Property Class',
    'Land Square Feet', 'Type of Residence', 'Apartments', 'Wall Material',
    'Roof Material', 'Rooms', 'Bedrooms', 'Basement', 'Construction Quality',
    'Site Desireability', 'Building Square Feet', 'Repair Condition',
    'Age', 'Near Major Road', 'Location Factor', 'Garage indicator',
    'Private Service Line Material', 'Public Service Line Material'
]

In [30]:
# Using this to make simpler

relevant_columns = [
    'Draw1', 'Draw2', 'Draw3', 'Max', 'longitude', 'latitude',
    'Private Service Line Material', 'Public Service Line Material'
]

In [31]:
df = df[relevant_columns]

In [32]:
# Using only "LEAD" and "NOT LEAD"
df = df[(df['Private Service Line Material'].isin(['LEAD', 'NOT LEAD'])) &
        (df['Public Service Line Material'].isin(['LEAD', 'NOT LEAD']))]


numeric columns with median

In [33]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
categorical_columns = df.select_dtypes(include=[object]).columns

numeric_imputer = SimpleImputer(strategy='median')
df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])


categorical variables

In [36]:
le = LabelEncoder()
for col in categorical_columns:
    df[col] = le.fit_transform(df[col].astype(str))

Normalize numeric features

In [37]:
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

-----------------------------------------------------------------------

# **K-means cluster**

In [38]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X = df[relevant_columns].dropna()

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-means clustering
n_clusters = 5  # You can adjust this number
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Analyze the results
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_summary = df.groupby('Cluster')[relevant_columns].mean()

print("Cluster Centers:")
print(cluster_centers)
print("\nCluster Summary:")
print(cluster_summary)

Cluster Centers:
[[5.11925702e-03 7.59339806e+01 1.22899192e-01 8.17864078e+01
  4.07810289e-01 7.91848541e-01 9.30097087e-01 6.85436893e-01]
 [4.22971947e-03 5.52222222e+01 8.98295767e-02 5.60163399e+01
  6.88036850e-01 2.61461287e-01 6.60130719e-01 3.26797386e-01]
 [1.75123902e-03 2.33590308e+01 3.33360781e-02 2.33061674e+01
  4.15733426e-01 7.83849123e-01 6.27753304e-01 9.69162996e-02]
 [5.41490281e-04 2.84803150e+00 1.36519244e-02 6.33228346e+00
  5.94905709e-01 6.83006010e-01 9.97637795e-01 9.97637795e-01]
 [8.87853300e-01 2.26666667e+01 6.88473520e-01 4.90000000e+01
  7.51363883e-01 4.40953551e-01 6.66666667e-01 3.33333333e-01]]

Cluster Summary:
            Draw1      Draw2     Draw3        Max  longitude  latitude  \
Cluster                                                                  
0        0.005122  76.040856  0.122967  81.898833   0.408139  0.791963   
1        0.004230  55.222222  0.089830  56.016340   0.688037  0.261461   
2        0.001751  23.359031  0.033336  23.

In [39]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt

kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Silhouette Score
silhouette_avg = silhouette_score(X, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")

# Calinski-Harabasz Index
ch_score = calinski_harabasz_score(X, cluster_labels)
print(f"Calinski-Harabasz Score: {ch_score}")

# Elbow Method
inertias = []
k_range = range(1, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)



Silhouette Score: 0.7212735368195415
Calinski-Harabasz Score: 9461.398137659391


# **Neural Networks with Dense and Dropout**

In [40]:
X = df.drop(['Private Service Line Material', 'Public Service Line Material'], axis=1)
y_private = df['Private Service Line Material']
y_public = df['Public Service Line Material']

In [None]:
# Train the data
X_train, X_test, y_private_train, y_private_test, y_public_train, y_public_test = train_test_split(
    X, y_private, y_public, test_size=0.2, random_state=42
)

# Forgot to import
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Dropout

# Convert targets to categorical
y_private_train_cat = to_categorical(y_private_train)
y_private_test_cat = to_categorical(y_private_test)
y_public_train_cat = to_categorical(y_public_train)
y_public_test_cat = to_categorical(y_public_test)

# Define the model
def create_model1(input_shape, output_shape):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_shape,)),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(output_shape, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# Train models for private and public service lines
private_model = create_model1(X_train.shape[1], y_private_train_cat.shape[1])
public_model = create_model1(X_train.shape[1], y_public_train_cat.shape[1])

private_model.fit(X_train, y_private_train_cat, epochs=10, batch_size=32, validation_split=0.2, verbose=1)
print("\n--------------------------------------------------------------------------------------------\n")
public_model.fit(X_train, y_public_train_cat, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate models
private_pred = private_model.predict(X_test)
public_pred = public_model.predict(X_test)

private_pred_classes = np.argmax(private_pred, axis=1)
public_pred_classes = np.argmax(public_pred, axis=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4116 - loss: 5.1553 - val_accuracy: 0.8676 - val_loss: 0.9581
Epoch 2/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8330 - loss: 0.9012 - val_accuracy: 0.8676 - val_loss: 0.6183
Epoch 3/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8597 - loss: 0.5625 - val_accuracy: 0.8676 - val_loss: 0.5200
Epoch 4/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8417 - loss: 0.4765 - val_accuracy: 0.8676 - val_loss: 0.4810
Epoch 5/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8634 - loss: 0.4161 - val_accuracy: 0.8676 - val_loss: 0.4203
Epoch 6/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8489 - loss: 0.4118 - val_accuracy: 0.8676 - val_loss: 0.4028
Epoch 7/10
[1m51/51[0m [32m━━━━━━━━━━

# **Neaural Networks model-2**

In [42]:
def create_model2(input_shape, output_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dropout(0.3),  # Increase dropout slightly to prevent overfitting
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(output_shape, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# Train models for private and public service lines
private_model = create_model2(X_train.shape[1], y_private_train_cat.shape[1])
public_model = create_model2(X_train.shape[1], y_public_train_cat.shape[1])

private_model.fit(X_train, y_private_train_cat, epochs=10, batch_size=32, validation_split=0.2, verbose=1)
print("\n--------------------------------------------------------------------------------------------\n")
public_model.fit(X_train, y_public_train_cat, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate models
private_pred = private_model.predict(X_test)
public_pred = public_model.predict(X_test)

private_pred_classes = np.argmax(private_pred, axis=1)
public_pred_classes = np.argmax(public_pred, axis=1)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8074 - loss: 1.3769 - val_accuracy: 0.6691 - val_loss: 0.8149
Epoch 2/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8354 - loss: 0.6216 - val_accuracy: 0.8676 - val_loss: 0.3712
Epoch 3/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8714 - loss: 0.4475 - val_accuracy: 0.8480 - val_loss: 0.3782
Epoch 4/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8560 - loss: 0.4402 - val_accuracy: 0.8676 - val_loss: 0.3837
Epoch 5/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8651 - loss: 0.3869 - val_accuracy: 0.8676 - val_loss: 0.3595
Epoch 6/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8538 - loss: 0.4054 - val_accuracy: 0.8676 - val_loss: 0.3577
Epoch 7/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━

# **LeakyRelU**

In [43]:
from keras.layers import LeakyReLU

def create_model3(input_shape, output_shape):
    model = Sequential([
        Dense(128),
        LeakyReLU(alpha=0.01),  # Instead of 'relu'
        Dropout(0.3),
        Dense(64),
        LeakyReLU(alpha=0.01),
        Dropout(0.3),
        Dense(32),
        LeakyReLU(alpha=0.01),
        Dense(output_shape, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# Train models for private and public service lines
private_model = create_model3(X_train.shape[1], y_private_train_cat.shape[1])
public_model = create_model3(X_train.shape[1], y_public_train_cat.shape[1])

private_model.fit(X_train, y_private_train_cat, epochs=10, batch_size=32, validation_split=0.2, verbose=1)
print("\n--------------------------------------------------------------------------------------------\n")
public_model.fit(X_train, y_public_train_cat, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate models
private_pred = private_model.predict(X_test)
public_pred = public_model.predict(X_test)

private_pred_classes = np.argmax(private_pred, axis=1)
public_pred_classes = np.argmax(public_pred, axis=1)

Epoch 1/10




[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7065 - loss: 1.0632 - val_accuracy: 0.7598 - val_loss: 0.4537
Epoch 2/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8409 - loss: 0.5110 - val_accuracy: 0.8676 - val_loss: 0.3977
Epoch 3/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8645 - loss: 0.4546 - val_accuracy: 0.8676 - val_loss: 0.3943
Epoch 4/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8554 - loss: 0.4102 - val_accuracy: 0.8676 - val_loss: 0.4065
Epoch 5/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8741 - loss: 0.3936 - val_accuracy: 0.8676 - val_loss: 0.3811
Epoch 6/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8848 - loss: 0.3209 - val_accuracy: 0.8676 - val_loss: 0.3856
Epoch 7/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━

In [44]:
# For model one only
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print("Private Service Line Material Classification Report:")
print(classification_report(y_private_test, private_pred_classes))
print("\nPrivate Service Line Material Confusion Matrix:")
print(confusion_matrix(y_private_test, private_pred_classes))


print("\nPublic Service Line Material Classification Report:")
print(classification_report(y_public_test, public_pred_classes))
print("\nPublic Service Line Material Confusion Matrix:")
print(confusion_matrix(y_public_test, public_pred_classes))

# Function to predict service line material
def predict_service_line(model, X):
    pred = model.predict(X)
    pred_classes = np.argmax(pred, axis=1)
    return le.inverse_transform(pred_classes)

# Example prediction
sample_data = X_test.iloc[:5]  # Take first 5 samples for demonstration
private_predictions = predict_service_line(private_model, sample_data)
public_predictions = predict_service_line(public_model, sample_data)

print("\nSample Predictions:")
print("Private Service Line Material:", private_predictions)
print("Public Service Line Material:", public_predictions)

Private Service Line Material Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        63
           1       0.88      1.00      0.93       447

    accuracy                           0.88       510
   macro avg       0.44      0.50      0.47       510
weighted avg       0.77      0.88      0.82       510


Private Service Line Material Confusion Matrix:
[[  0  63]
 [  0 447]]

Public Service Line Material Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.51      0.48       152
           1       0.78      0.74      0.76       358

    accuracy                           0.67       510
   macro avg       0.62      0.63      0.62       510
weighted avg       0.68      0.67      0.68       510


Public Service Line Material Confusion Matrix:
[[ 78  74]
 [ 94 264]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
