In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Valence Classification

# Multiple factor linear regression

In [None]:
# Load the raw dataset from Data/data.csv (path is relative to the notebook's
# working directory) and preview the first rows.
data = pd.read_csv(os.path.join('Data', 'data.csv'))
data.head()

In [None]:
# Keep only the columns used for the valence regression: everything in `data`
# except the columns listed here.
valence_data = data.drop(
    columns=["artists", "id", "name", "release_date", "year", "explicit", "mode", "popularity"]
)
valence_data.head()

In [None]:
# Features: every remaining column except the regression target.
X = valence_data.drop(columns="valence")
# Target as an (n_samples, 1) column array so it can be passed to a scaler later.
y = valence_data[["valence"]].to_numpy()
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
 from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

### BEGIN SOLUTION
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [None]:
# Apply the scalers fitted on the training split to both splits.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with default settings.
model = LinearRegression()

In [None]:
# Fit on the standardized training data, then score both splits.
# (The same test-set score call is reported as R2 in the metrics cell further down.)
model.fit(X_train_scaled, y_train_scaled)
training_score = model.score(X_train_scaled, y_train_scaled)
testing_score = model.score(X_test_scaled, y_test_scaled)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

In [None]:
# Residual plot: predicted value on the x-axis, (prediction - actual) on the y-axis.
# The model was already fitted in the training cell, so it is not refitted here;
# each split is predicted once and the result reused.
train_predictions = model.predict(X_train_scaled)
predictions = model.predict(X_test_scaled)  # test-set predictions, reused by later cells

plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(predictions, predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
# The zero-residual reference line must span the plotted x-range (the
# predictions), not the range of the actual target values.
x_min = min(train_predictions.min(), predictions.min())
x_max = max(train_predictions.max(), predictions.max())
plt.hlines(y=0, xmin=x_min, xmax=x_max)
plt.xlabel("Predicted value (standardized)")
plt.ylabel("Residual (prediction - actual)")
plt.title("Residual Plot")
plt.show()

In [None]:
# `predictions` are in standardized target units while `y_test` is on the
# original scale; map the predictions back through y_scaler so the two printed
# lines are directly comparable.
predictions_original = y_scaler.inverse_transform(predictions[:10])
print(f"First 10 Predictions:   {predictions_original.tolist()}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")


In [None]:
from sklearn.metrics import mean_squared_error

# Both metrics are computed against the standardized target, so the MSE is in
# standardized units rather than the original target scale.
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")

# Logistic regression

In [None]:
# Build the classification table: same columns as the regression section, with
# `valence` replaced by a "positive"/"negative" sentiment label (threshold 0.5).
valence_new = data.drop(
    columns=["artists", "id", "name", "release_date", "year", "explicit", "mode", "popularity"]
)
# Drop incomplete rows BEFORE labelling: a NaN valence compares False against
# 0.5, so it would otherwise be labelled "negative" and then survive a later
# dropna (the valence column is gone by that point).
valence_new = valence_new.dropna()
valence_new['sentiment'] = np.where(valence_new['valence'] > 0.5, "positive", "negative")
valence_new = valence_new.drop(columns="valence")
valence_new.head()

In [None]:
# Split the classification table into the feature frame and the label series.
X = valence_new.drop(columns="sentiment")
y = valence_new["sentiment"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded split of the sentiment features/labels into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max from the training split and scale it in one
# step, then apply the same fitted scaler to the test split.
X_Scaler = MinMaxScaler()
X_train_scaled = X_Scaler.fit_transform(X_train)
X_test_scaled = X_Scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with default hyperparameters.
classifier = LogisticRegression()
# Bare name as the last expression so the notebook displays the estimator.
classifier

In [None]:
# Train the sentiment classifier on the min-max-scaled training features.
classifier.fit(X_train_scaled, y_train)

In [None]:
# Report the classifier's score on each split; a large gap between the two
# would indicate overfitting.
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
# Predict labels for the held-out rows and print the first ten next to the
# true labels for a quick visual check.
predictions = classifier.predict(X_test_scaled)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

In [None]:
# Side-by-side table of predicted vs. actual labels; reset_index(drop=True)
# renumbers the rows from 0.
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Computing the matrix explicitly and rendering it with ConfusionMatrixDisplay
# draws the same figure and works on both older and current releases.
cm = confusion_matrix(y_test, classifier.predict(X_test_scaled), labels=classifier.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_).plot(cmap=plt.cm.Blues)

plt.show()

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# `classifier` uses LogisticRegression's default 'lbfgs' solver, which only
# supports the l2 penalty: every l1 candidate would fail to fit and be scored
# NaN. 'liblinear' supports both penalties, so the whole grid is evaluated.
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.0001, 0.0005, 0.001, 0.005],
              'solver': ['liblinear']}
grid = GridSearchCV(classifier, param_grid, verbose=3)

In [None]:
# Run the cross-validated search over param_grid on the scaled training data.
grid.fit(X_train_scaled, y_train)

In [None]:
# Best parameter combination found by the search and its cross-validation score.
print(grid.best_params_)
print(grid.best_score_)

# Random Forest

In [None]:
from sklearn import tree
# Single decision tree, fitted before the random forest below. random_state is
# fixed so the reported score is repeatable across kernel restarts.
clf = tree.DecisionTreeClassifier(random_state=1)
clf = clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 200-tree random forest. random_state is fixed so the test score and the
# feature importances read from `rf` later are repeatable between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=1)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

In [None]:
# Take the names from the feature frame the forest was trained on, not from
# valence_new, which still contains the `sentiment` target column.
feature_names = X.columns

In [None]:
# Pair each importance with a name positionally and list them from most to
# least important; correctness relies on feature_names being in training-column order.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Hyperparameter Tuning

In [None]:
# List the parameter names the forest exposes (candidates for the grid search).
rf.get_params().keys()

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the random-forest hyperparameter search.

# Number of trees in random forest
n_estimators = np.linspace(start=200, stop=400, num=2).astype(int).tolist()
# Number of features to consider at every split
# NOTE(review): not referenced by the param_grid cell that follows; 'auto' is
# also rejected by recent scikit-learn releases - confirm before wiring it in.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree (None = unlimited depth)
max_depth = [*np.linspace(10, 20, num=2).astype(int).tolist(), None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
# NOTE(review): also not referenced by the param_grid cell that follows.
bootstrap = [True, False]

In [None]:
# Search space for the forest, assembled from the candidate lists defined in
# the previous cell, and the grid-search wrapper around `rf`.
param_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
grid = GridSearchCV(rf, param_grid, verbose=3)

In [None]:
# Run the random-forest grid search on the scaled training data.
grid.fit(X_train_scaled, y_train)

In [None]:
# Best forest parameters found by the search and their cross-validation score.
print(grid.best_params_)
print(grid.best_score_)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a standardizing scaler on the training features for the KNN models.
# Note: this rebinds X_scaler, which earlier held the regression feature scaler.
X_scaler = StandardScaler().fit(X_train)

In [None]:
# Replace the min-max-scaled arrays used by the earlier models in this section
# with standardized versions.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Sweep odd values of k and record train/test accuracy for each one.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share the axes, so each gets a legend entry and the y-axis is
# labelled generically rather than as "Testing" only.
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

In [None]:
# Refit KNN at a single fixed k and report its test accuracy.
# NOTE(review): k=17 is hard-coded, presumably read off the sweep plot above - confirm.
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train_scaled, y_train)
print('k=17 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

# Hyperparameter tuning

In [None]:
# List the parameter names KNN exposes (candidates for the grid search).
knn.get_params().keys()

In [None]:
# List hyperparameters that we want to tune.
leaf_size = [int(x) for x in np.linspace(start = 1, stop = 3, num = 3)]
n_neighbors = [int(x) for x in np.linspace(start = 1, stop = 2, num = 2)]
p=[1,2]
# Convert to dictionary. This line was commented out while `hyperparameters`
# was still used below, so the cell raised NameError.
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
# Create new KNN object
knn_2 = KNeighborsClassifier()
# Use GridSearch with 10-fold cross-validation (note: rebinds `clf`).
clf = GridSearchCV(knn_2, hyperparameters, cv=10)


In [None]:
# Search space for KNN, built from the candidate lists defined in the previous
# cell, and the grid-search wrapper around the current `knn` estimator.
param_grid = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
grid = GridSearchCV(knn, param_grid, verbose=3)

In [None]:
# Run the KNN grid search on the standardized training data.
grid.fit(X_train_scaled, y_train)

In [None]:
# Best KNN parameters found by the search and their cross-validation score.
print(grid.best_params_)
print(grid.best_score_)

# SVM

In [None]:
# Rebuild the feature frame and label series from valence_new for the SVM.
X = valence_new.drop(columns="sentiment")
y = valence_new["sentiment"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded split, using the same seed as the earlier split in this section.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.svm import SVC 
# Support-vector classifier with a linear kernel (rebinds `model`, which
# earlier held the linear regression).
model = SVC(kernel='linear')

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training split and scale it in one step, then
# apply the same fitted scaler to the test split.
X_Scaler = MinMaxScaler()
X_train_scaled = X_Scaler.fit_transform(X_train)
X_test_scaled = X_Scaler.transform(X_test)

In [None]:
# Train the linear SVC on the min-max-scaled training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Report the SVC's score on each split.
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

# Hyperparameter tuning

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
# `model` is SVC(kernel='linear'). gamma is only used by the rbf/poly/sigmoid
# kernels, so searching over it would triple the number of fits without
# changing any score. Only C is tuned.
param_grid = {'C': [1, 5, 10]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Run the SVC grid search on the scaled training data.
grid.fit(X_train_scaled, y_train)

In [None]:
# Best SVC parameters found by the search and their cross-validation score.
print(grid.best_params_)
print(grid.best_score_)

# Explicit

# Logistic regression

In [None]:
# Table for the `explicit` target: remove the listed columns from `data`
# (keeping `explicit`) and discard rows with missing values.
explicit_new = data.drop(
    columns=["artists", "id", "name", "release_date", "year", "mode", "popularity"]
).dropna()
explicit_new.head()

In [None]:
# Split the table into the feature frame and the `explicit` label series.
X = explicit_new.drop(columns="explicit")
y = explicit_new["explicit"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded split of the explicit features/labels into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training split and scale it in one step, then
# apply the same fitted scaler to the test split.
X_Scaler = MinMaxScaler()
X_train_scaled = X_Scaler.fit_transform(X_train)
X_test_scaled = X_Scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier for the `explicit` target, default hyperparameters.
classifier2 = LogisticRegression()
# Bare name as the last expression so the notebook displays the estimator.
classifier2

In [None]:
# Train the explicit classifier on the min-max-scaled training features.
classifier2.fit(X_train_scaled, y_train)

In [None]:
# Report the classifier's score on each split.
print(f"Training Data Score: {classifier2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier2.score(X_test_scaled, y_test)}")

In [None]:
# Predict `explicit` for the held-out rows and print the first ten next to the
# true labels for a quick visual check.
predictions = classifier2.predict(X_test_scaled)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

In [None]:
# Side-by-side table of predicted vs. actual labels; reset_index(drop=True)
# renumbers the rows from 0.
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Computing the matrix explicitly and rendering it with ConfusionMatrixDisplay
# draws the same figure and works on both older and current releases.
cm = confusion_matrix(y_test, classifier2.predict(X_test_scaled), labels=classifier2.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier2.classes_).plot(cmap=plt.cm.Blues)

plt.show()

# Random forest

In [None]:
from sklearn import tree
# Single decision tree, fitted before the random forest below. random_state is
# fixed so the reported score is repeatable across kernel restarts.
clf = tree.DecisionTreeClassifier(random_state=1)
clf = clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 200-tree random forest. random_state is fixed so the test score and the
# feature importances read from `rf` later are repeatable between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=1)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

In [None]:
# Take the names from the feature frame the forest was trained on.
# explicit_new.columns still contains the `explicit` target, which shifts every
# name after it when zipped positionally with rf.feature_importances_.
feature_names = X.columns

In [None]:
# Pair each importance with a name positionally and list them from most to
# least important; correctness relies on feature_names being in training-column order.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a standardizing scaler on the explicit training features for the KNN
# models (rebinds X_scaler).
X_scaler = StandardScaler().fit(X_train)

In [None]:
# Replace the min-max-scaled arrays used by the earlier models in this section
# with standardized versions.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Sweep odd values of k and record train/test accuracy for each one.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share the axes, so each gets a legend entry and the y-axis is
# labelled generically rather than as "Testing" only.
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

In [None]:
# Refit KNN at a single fixed k and report its test accuracy.
# NOTE(review): k=11 is hard-coded, presumably read off the sweep plot above - confirm.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train_scaled, y_train)
print('k=11 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

# SVM

In [None]:
# Rebuild the feature frame and label series from explicit_new for the SVM.
X = explicit_new.drop(columns="explicit")
y = explicit_new["explicit"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded split, using the same seed as the earlier split in this section.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.svm import SVC 
# Support-vector classifier with a linear kernel for the `explicit` target
# (rebinds `model`).
model = SVC(kernel='linear')

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training split and scale it in one step, then
# apply the same fitted scaler to the test split.
X_Scaler = MinMaxScaler()
X_train_scaled = X_Scaler.fit_transform(X_train)
X_test_scaled = X_Scaler.transform(X_test)

In [None]:
# Train the linear SVC on the min-max-scaled training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Report the SVC's score on each split.
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

# Mode

# Logistic regression

In [None]:
# Table for the `mode` target: remove the listed columns from `data`
# (keeping `mode`) and discard rows with missing values.
mode_new = data.drop(
    columns=["artists", "id", "name", "release_date", "explicit", "year", "popularity"]
).dropna()
mode_new.head()

In [None]:
# Split the table into the feature frame and the `mode` label series.
X = mode_new.drop(columns="mode")
y = mode_new["mode"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded split of the mode features/labels into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training split and scale it in one step, then
# apply the same fitted scaler to the test split.
X_Scaler = MinMaxScaler()
X_train_scaled = X_Scaler.fit_transform(X_train)
X_test_scaled = X_Scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier for the `mode` target, default hyperparameters.
classifier3 = LogisticRegression()
# Bare name as the last expression so the notebook displays the estimator.
classifier3

In [None]:
# Train the mode classifier on the min-max-scaled training features.
classifier3.fit(X_train_scaled, y_train)

In [None]:
# Report the classifier's score on each split.
print(f"Training Data Score: {classifier3.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier3.score(X_test_scaled, y_test)}")

In [None]:
# Predict `mode` for the held-out rows and print the first ten next to the
# true labels for a quick visual check.
predictions = classifier3.predict(X_test_scaled)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

In [None]:
# Side-by-side table of predicted vs. actual labels; reset_index(drop=True)
# renumbers the rows from 0.
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Computing the matrix explicitly and rendering it with ConfusionMatrixDisplay
# draws the same figure and works on both older and current releases.
cm = confusion_matrix(y_test, classifier3.predict(X_test_scaled), labels=classifier3.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier3.classes_).plot(cmap=plt.cm.Blues)

plt.show()

# Random forest

In [None]:
from sklearn import tree
# Single decision tree, fitted before the random forest below. random_state is
# fixed so the reported score is repeatable across kernel restarts.
clf = tree.DecisionTreeClassifier(random_state=1)
clf = clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 200-tree random forest. random_state is fixed so the test score and the
# feature importances read from `rf` later are repeatable between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=1)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

In [None]:
# Take the names from the feature frame the forest was trained on.
# mode_new.columns still contains the `mode` target, which shifts every name
# after it when zipped positionally with rf.feature_importances_.
feature_names = X.columns

In [None]:
# Pair each importance with a name positionally and list them from most to
# least important; correctness relies on feature_names being in training-column order.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a standardizing scaler on the mode training features for the KNN models
# (rebinds X_scaler).
X_scaler = StandardScaler().fit(X_train)

In [None]:
# Replace the min-max-scaled arrays used by the earlier models in this section
# with standardized versions.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Sweep odd values of k and record train/test accuracy for each one.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share the axes, so each gets a legend entry and the y-axis is
# labelled generically rather than as "Testing" only.
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

In [None]:
# Refit KNN at a single fixed k and report its test accuracy.
# NOTE(review): k=13 is hard-coded, presumably read off the sweep plot above - confirm.
knn = KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train_scaled, y_train)
print('k=13 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

# SVM

In [None]:
# Rebuild the feature frame and label series from mode_new for the SVM.
X = mode_new.drop(columns="mode")
y = mode_new["mode"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded split, using the same seed as the earlier split in this section.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.svm import SVC 
# Support-vector classifier with a linear kernel for the `mode` target
# (rebinds `model`).
model = SVC(kernel='linear')

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training split and scale it in one step, then
# apply the same fitted scaler to the test split.
X_Scaler = MinMaxScaler()
X_train_scaled = X_Scaler.fit_transform(X_train)
X_test_scaled = X_Scaler.transform(X_test)

In [None]:
# Train the linear SVC on the min-max-scaled training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Report the SVC's score on each split.
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")