In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
churn_df= pd.read_csv('churn_df.csv',index_col=0)
X = churn_df.drop('churn',axis = 1).values
y = churn_df['churn'].values
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.4,random_state = 42)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

print(y_test.shape)
print(y_pred.shape)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

(1334,)
(1334,)
[[1106   11]
 [ 183   34]]
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      1117
           1       0.76      0.16      0.26       217

    accuracy                           0.85      1334
   macro avg       0.81      0.57      0.59      1334
weighted avg       0.84      0.85      0.81      1334



In [None]:
music_df = pd.read_csv("music.csv")
music_dummies = pd.get_dummies(music_df["genre"], drop_first=True).astype(int)
# pd.get_dummies is used to convert categorical variable(s) into dummy/indicator variables.
# In this case, it converts the 'genre' column into one-hot encoded columns, dropping the first category to avoid multicollinearity.
print(music_dummies.head())

   Anime  Blues  Classical  Country  Electronic  Hip-Hop  Jazz  Rap  Rock
0      0      0          0        0           0        0     1    0     0
1      0      0          0        0           0        0     0    1     0
2      0      0          0        0           1        0     0    0     0
3      0      0          0        0           0        0     0    0     1
4      0      0          0        0           0        0     0    1     0


In [29]:
music_dummies = pd.concat([music_df, music_dummies], axis = 1)
music_dummies = music_dummies.drop('genre', axis=1)
#music_dummies.head()
print(music_dummies.columns)

X = music_dummies.drop('popularity', axis=1).values
y = music_dummies['popularity'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
linear_reg = LinearRegression()
linear_reg = cross_val_score(linear_reg, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

print(np.sqrt(-linear_reg))


Index(['Unnamed: 0', 'popularity', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'Anime', 'Blues', 'Classical',
       'Country', 'Electronic', 'Hip-Hop', 'Jazz', 'Rap', 'Rock'],
      dtype='object')
[8.14828914 8.63267535 7.56370022 8.61139914 7.91041513]


In [3]:
music_df = pd.read_csv('music_unclean.csv', index_col=0)
print(music_df.isna().sum().sort_values())

genre                 8
popularity           31
loudness             44
liveness             46
tempo                46
speechiness          59
duration_ms          91
instrumentalness     91
danceability        143
valence             143
acousticness        200
energy              200
dtype: int64


In [31]:
music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])
print(music_df.isna().sum().sort_values())

popularity            0
liveness              0
loudness              0
tempo                 0
genre                 0
duration_ms          29
instrumentalness     29
speechiness          53
danceability        127
valence             127
acousticness        178
energy              178
dtype: int64


In [None]:
music_df = pd.read_csv('music_unclean.csv', index_col = 0)
#print(music_df.isna().sum().sort_values())

music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])

print(music_df.isna().sum().sort_values())
print("Shape of the `music_df`: {}".format(music_df.shape))

# Convert genre to a binary feature
music_df["genre"] = np.where(music_df["genre"] == "Rock", 1, 0)

X = music_df.drop('genre', axis = 1).values
y = music_df['genre'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
# Instantiate an imputer
imputer = SimpleImputer()
X_train = imputer.fit_transform(X_train) ###################
X_test = imputer.transform(X_test)          ###################

# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=3)

# Build steps for the pipeline
steps = [("imputer", imputer), 
        ("knn", knn)]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Print the confusion matrix

print(confusion_matrix(y_test, y_pred))
print(pipeline.score(X_test,y_test))

popularity            0
liveness              0
loudness              0
tempo                 0
genre                 0
duration_ms          29
instrumentalness     29
speechiness          53
danceability        127
valence             127
acousticness        178
energy              178
dtype: int64
Shape of the `music_df`: (892, 12)
[[78 57]
 [55 78]]
0.582089552238806


In [None]:
#print(X_train.shape)
columns = ['acousticness', 'danceability', 'duration_ms', 'energy',
        'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
        'valence', 'genre']
check = pd.DataFrame(X_train, columns = columns)
print(check.isna().sum().sort_values())

acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
genre               0
dtype: int64


In [None]:
# Load dataset
music_df = pd.read_csv("music_unclean.csv", index_col=0)

# Drop rows with missing values in critical columns
music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])

# Convert "genre" to binary (1 for Rock, 0 otherwise)
music_df["genre"] = np.where(music_df["genre"] == "Rock", 1, 0)

# Define X (features) and y (target)
X = music_df.drop('genre', axis=1).values
y = music_df['genre'].values

X = music_df.drop('genre', axis = 1).values
y = music_df['genre'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

# Instantiate an imputer
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Confusion matrix:", confusion_matrix(y_test, y_pred))
print("Knn Score:", knn.score(X_test, y_test))

Confusion matrix: [[82 53]
 [72 61]]
Knn Score: 0.5335820895522388
