In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE

## Now we load the data

In [None]:
# Read the training table from disk, then separate the 'Activity' column
# (the target to predict) from the remaining columns (the predictors).
df = pd.read_csv('train.csv')
y = df['Activity']
X = df.drop(columns=['Activity'])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore every score below) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Remove columns that take a single distinct value in the training split.
# The columns are identified on the training data only, and the same set is
# then removed from the test data so both frames keep identical columns.
nunique = X_train.nunique()
unique_features = [column for column, count in nunique.items() if count == 1]
X_train = X_train.drop(columns=unique_features)
X_test = X_test.drop(columns=unique_features)

In [None]:
# Feature-subset sizes evaluated by every selection method below,
# ordered from the largest subset to the smallest.
# NOTE(review): 1773 presumably equals the number of columns left after the
# constant columns are dropped -- confirm if the input data changes.
n_features = [
    1773, 1500, 1200, 900, 600,
    300, 100, 50, 10,
]

In [None]:
# Lasso
# Fit one L1-regularised linear model on the training split and rank the
# columns by the absolute value of their coefficients (largest first).
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
coef = pd.Series(lasso.coef_, index=X_train.columns)
coef_sorted = coef.abs().sort_values(ascending=False)

# Each entry pairs the text that starts a report line with a factory that
# builds a fresh, unfitted classifier for every subset size.
lasso_report_plan = [
    ("SVM accuracy with", SVC),
    ("Logistic Regression accuracy with", lambda: LogisticRegression(max_iter=10000)),
]

for section, (report_prefix, make_classifier) in enumerate(lasso_report_plan):
    if section > 0:
        # Separator between the report sections.
        print()
        print("###############")
        print()

    for k in n_features:
        # Keep the k highest-ranked columns in both splits.
        top_k_features = coef_sorted.head(k).index.tolist()
        selected_features_train = X_train[top_k_features]
        selected_features_test = X_test[top_k_features]

        classifier = make_classifier()
        classifier.fit(selected_features_train, y_train)
        test_accuracy = classifier.score(selected_features_test, y_test)
        print(report_prefix, k, "features:", test_accuracy)

SVM accuracy with 1773 features: 0.7868561278863233
SVM accuracy with 1500 features: 0.7877442273534636
SVM accuracy with 1200 features: 0.7850799289520426
SVM accuracy with 900 features: 0.7859680284191829
SVM accuracy with 600 features: 0.7841918294849023
SVM accuracy with 300 features: 0.7770870337477798
SVM accuracy with 100 features: 0.7761989342806395
SVM accuracy with 50 features: 0.7726465364120781
SVM accuracy with 10 features: 0.7468916518650088

###############

Logistic Regression accuracy with 1773 features: 0.761101243339254
Logistic Regression accuracy with 1500 features: 0.7575488454706927
Logistic Regression accuracy with 1200 features: 0.7566607460035524
Logistic Regression accuracy with 900 features: 0.7371225577264654
Logistic Regression accuracy with 600 features: 0.7451154529307282
Logistic Regression accuracy with 300 features: 0.7406749555950266
Logistic Regression accuracy with 100 features: 0.7486678507992895
Logistic Regression accuracy with 50 features: 0.74

In [None]:
# SelectKBest
# Univariate selection: for each subset size k, keep the k columns with the
# highest f_classif score on the training split, then measure test accuracy
# of an SVM and of a logistic regression trained on those columns only.
#
# The selected columns depend only on k and the training data, so the
# selector is fitted once per k and the result is shared by both classifiers
# instead of being refitted inside each evaluation loop.
kbest_columns = {}
for k in n_features:
    kbest = SelectKBest(f_classif, k=k)
    kbest.fit(X_train, y_train)
    top_k_indices = kbest.get_support(indices=True)
    kbest_columns[k] = X_train.columns[top_k_indices]

for k in n_features:
    top_k_features = kbest_columns[k]
    selected_features_train = X_train[top_k_features]
    selected_features_test = X_test[top_k_features]
    svm = SVC()
    svm.fit(selected_features_train, y_train)
    svm_score = svm.score(selected_features_test, y_test)
    print("SVM accuracy with", k, "features:", svm_score)

# Separator between the two report sections.
print()
print("###############")
print()

for k in n_features:
    top_k_features = kbest_columns[k]
    selected_features_train = X_train[top_k_features]
    selected_features_test = X_test[top_k_features]
    # max_iter is raised well above the default to give the solver room to converge.
    lr = LogisticRegression(max_iter=10000)
    lr.fit(selected_features_train, y_train)
    lr_score = lr.score(selected_features_test, y_test)
    print("Logistic Regression accuracy with", k, "features:", lr_score)

SVM accuracy with 1773 features: 0.7868561278863233
SVM accuracy with 1500 features: 0.7824156305506217
SVM accuracy with 1200 features: 0.7859680284191829
SVM accuracy with 900 features: 0.7806394316163411
SVM accuracy with 600 features: 0.7788632326820604
SVM accuracy with 300 features: 0.7761989342806395
SVM accuracy with 100 features: 0.761101243339254
SVM accuracy with 50 features: 0.7628774422735346
SVM accuracy with 10 features: 0.7513321492007105

###############

Logistic Regression accuracy with 1773 features: 0.761101243339254
Logistic Regression accuracy with 1500 features: 0.7628774422735346
Logistic Regression accuracy with 1200 features: 0.761101243339254
Logistic Regression accuracy with 900 features: 0.7548845470692718
Logistic Regression accuracy with 600 features: 0.7539964476021315
Logistic Regression accuracy with 300 features: 0.7513321492007105
Logistic Regression accuracy with 100 features: 0.7522202486678508
Logistic Regression accuracy with 50 features: 0.7531