In [5]:
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import time

# Download and load dataset
def load_data():
    page_url = 'https://archive.ics.uci.edu/dataset/240/human+activity+recognition+using+smartphones'
    page_response = requests.get(page_url)
    if page_response.status_code == 200:
        soup = BeautifulSoup(page_response.content, 'html.parser')
        download_link = soup.select_one('a[href$=".zip"]')['href']
        full_download_url = 'https://archive.ics.uci.edu' + download_link
        response = requests.get(full_download_url)
        if response.status_code == 200:
            with zipfile.ZipFile(io.BytesIO(response.content)) as outer_zip:
                inner_zip_name = 'UCI HAR Dataset.zip'
                with outer_zip.open(inner_zip_name) as inner_zip_file:
                    with zipfile.ZipFile(io.BytesIO(inner_zip_file.read())) as inner_zip:
                        with inner_zip.open('UCI HAR Dataset/train/X_train.txt') as myfile:
                            df = pd.read_csv(myfile, sep='\s+', header=None)
                        with inner_zip.open('UCI HAR Dataset/train/y_train.txt') as myfile_y:
                            y = pd.read_csv(myfile_y, sep='\s+', header=None)
    else:
        raise Exception("Failed to download or parse the dataset.")
    return df, y

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
import numpy as np
import time

# Load dataset
df, y = load_data()

print(df.head())
print(y.head())

print(df.isnull())
print(y.isnull())

print(df.shape)
print(y.shape)



In [None]:
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y.values.ravel())

In [None]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

In [16]:
X_train_full, X_test_full, y_train, y_test = train_test_split(df_scaled, encoded_y, test_size=0.2, random_state=42)

In [29]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

start_time = time.time()
model = GaussianNB()
model.fit(X_train_full, y_train)
y_pred = model.predict(X_test_full)
end_time = time.time()
full_features_time = end_time - start_time
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Time taken:", full_features_time)

Accuracy: 0.7314751869476547
Time taken: 0.06988954544067383


In [None]:
from sklearn.cluster import KMeans

# K-Means for dimensionality reduction
n_clusters = 50
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(df_scaled.T)
selected_features_indices = []

# Loop through each cluster
for i in range(n_clusters):
    cluster_indices = np.where(kmeans.labels_ == i)[0]
    selected_index = np.random.choice(cluster_indices)
    selected_features_indices.append(selected_index)

selected_features = df_scaled[:, selected_features_indices]

print(selected_features_indices)
print(selected_features)

In [31]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

start_time = time.time()
model = GaussianNB()
model.fit(X_train_full[:,selected_features_indices], y_train)
y_pred = model.predict(X_test_full[:,selected_features_indices])
end_time = time.time()
full_features_time = end_time - start_time
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Time taken:", full_features_time)

Accuracy: 0.8123725356900068
Time taken: 0.025442123413085938
