In [None]:
# Fetch the dataset archive with gdown (the argument is presumably a Google Drive file id —
# verify it still resolves) and extract it into the current working directory.
!gdown 1k-uSHaMEagMkORIRr2oJ0dtOfFcbyLHD
!unzip occupancy_data.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import cross_val_score, learning_curve

In [None]:
# Read the three comma-separated files extracted from the archive into separate frames.
df, df2, df_training = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        '/content/datatest.txt',
        '/content/datatest2.txt',
        '/content/datatraining.txt',
    )
)

# **Let's Explore the Data**

**Data info:**
* Date, timestamp in the format year-month-day hour:minute:second
* Temperature, in Celsius
* Relative Humidity, %
* Light, in Lux
* CO2, in ppm
* Humidity Ratio, derived quantity from temperature and relative humidity, in kg water-vapor / kg air
* Occupancy, 0 or 1, 0 for not occupied, 1 for occupied status

In [None]:
# (rows, columns) of each raw frame, in the order: df, df2, df_training.
tuple(frame.shape for frame in (df, df2, df_training))

In [None]:
# Print the schema (column names, dtypes, non-null counts) of each raw frame under its own
# heading. DataFrame.info() prints and returns None, so calling it in a loop avoids the
# stray "(None, None, None)" that a tuple of info() calls would echo as the cell output.
for frame_name, frame in (('df', df), ('df2', df2), ('df_training', df_training)):
    print(f"--- {frame_name} ---")
    frame.info()
    print()

Let's explore the differences between the given files and try to understand why there are three files instead of a single one!

In [None]:
# Preview the first ten rows of datatest.txt.
df.head(n=10)

In [None]:
# Preview the first five rows of datatraining.txt.
df_training.head(n=5)

In [None]:
# Preview the first five rows of datatest2.txt.
df2.head(n=5)

Let's check the timeline of the data

In [None]:
# Report the time span and row count of each file.
# Fixes relative to the earlier version of this cell:
#   * the datatraining line reported df.size instead of the training frame's own count;
#   * the printed names now match the files that were actually loaded (datatest*.txt);
#   * len(frame) counts rows, whereas .size is rows * columns.
# `date` is still a string here, so min/max compare the timestamps as text.
for file_name, frame in (
    ('datatest.txt', df),
    ('datatraining.txt', df_training),
    ('datatest2.txt', df2),
):
    print(f"{file_name} is within {frame.date.min()} - {frame.date.max()} with entries : {len(frame)}")

Nothing useful so far!

It looks like these files are a single dataset split into three parts!
Let's join them all and see what we have.

In [None]:
# Stack the three files (datatest, datatest2, datatraining — in that order) into one frame
# with a fresh 0..n-1 index, then show its shape.
frames = [df, df2, df_training]
merged_df = pd.concat(frames, ignore_index=True)
merged_df.shape

In [None]:
# Column names, dtypes and non-null counts of the combined frame.
merged_df.info()

## **Time Series Analysis**

In [None]:
# The timestamps are read from the CSVs as text; parse them into pandas datetimes.
merged_df['date'] = merged_df['date'].pipe(pd.to_datetime)

In [None]:
# Hour-of-day profile: how many readings fall in each hour, split by occupancy status.
# to_datetime is a no-op when `date` is already datetime; it keeps this cell runnable on its own.
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df['hour'] = merged_df['date'].dt.hour

# Count readings per (hour, Occupancy). fill_value=0 keeps empty combinations as 0 instead of
# NaN, and reindexing over all 24 hours guarantees one bar group per hour even if an hour
# never occurs in the data.
hourly_data = (
    merged_df.groupby(['hour', 'Occupancy'])
    .size()
    .unstack(fill_value=0)
    .reindex(range(24), fill_value=0)
)

# Plot
fig, ax = plt.subplots(figsize=(12, 6))
hourly_data.plot(kind='bar', stacked=False, color=['blue', 'orange'], ax=ax)
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Number of Readings')  # the bars are row counts, not a sum of the Occupancy flag
ax.set_title('24-Hour Seasonality Pattern')
# Columns are sorted ascending (0, 1): 0 = not occupied, 1 = occupied.
ax.legend(['Unoccupied', 'Occupied'], title='Occupancy')
ax.grid(True)
plt.show()

In [None]:
# Day-of-week profile: number of readings per weekday, split by occupancy status.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Make sure `date` is datetime, then derive the weekday name for every reading.
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df['day_of_week'] = merged_df['date'].dt.day_name()

# Rows: weekdays in calendar order; columns: Occupancy values; cells: number of readings.
occupancy_count = (
    merged_df.groupby(['day_of_week', 'Occupancy'])
    .size()
    .unstack()
    .reindex(order)
)

# Grouped bar chart, one group per weekday.
fig, ax = plt.subplots(figsize=(12, 6))
occupancy_count.plot(kind='bar', stacked=False, color=['blue', 'orange'], ax=ax)
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Count of Occupancy')
ax.set_title('Occupancy Count by Day of the Week')
ax.grid(True)
plt.show()


## **Statistical Analysis**

In [None]:
# Summary statistics of the numeric columns, one row per column.
merged_df.describe().transpose()

In [None]:
# Summary (count / unique / top / freq) of the object-typed columns, one row per column.
merged_df.describe(include='object').transpose()

In [None]:

# One box plot per sensor feature, side by side, to eyeball spread and outliers.
columns_to_plot = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']

fig, axes = plt.subplots(nrows=1, ncols=len(columns_to_plot), figsize=(20, 10))
axes = axes.ravel()  # 1-D array of axes, one per feature

for ax, col in zip(axes, columns_to_plot):
    merged_df.boxplot(column=col, ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()


In [None]:

# Box plot of every sensor feature, split by occupancy status, to see which features
# separate the two classes.
columns_to_plot = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 15))  # 6 panels for 5 features
axes = axes.flatten()  # Flatten the 2x3 grid so panels can be indexed 0..5

for i, column in enumerate(columns_to_plot):
    merged_df.boxplot(column=column, by='Occupancy', ax=axes[i])
    axes[i].set_title(f'{column} by Occupancy')
    axes[i].set_xlabel('Occupancy')
    axes[i].set_ylabel(column)

# The grid has more panels than features; hide the leftovers so no empty frame is drawn.
for unused_ax in axes[len(columns_to_plot):]:
    unused_ax.set_visible(False)

# Set after the loop: boxplot(by=...) writes its own figure-level title on every call.
plt.suptitle('Box Plots of Variables by Occupancy')
plt.tight_layout()
plt.show()

Label Distribution

In [None]:
# Histogram of the label values over the merged data.
merged_df['Occupancy'].plot.hist(bins=3, rwidth=0.8, density=False)

In [None]:
# Share of occupied vs. unoccupied readings over the whole merged dataset.
# (Previously this used `df`, i.e. only datatest.txt, unlike the rest of the analysis.)
label_names = {0: 'Unoccupied', 1: 'Occupied'}

# sort_index() fixes the wedge order to (0, 1). value_counts() alone orders by frequency,
# so a hard-coded legend order could silently attach the wrong name to each wedge.
label_counts = merged_df['Occupancy'].value_counts().sort_index()
wedge_labels = [label_names.get(value, str(value)) for value in label_counts.index]

# Plot
plt.figure(figsize=(8, 8))
plt.pie(label_counts, labels=wedge_labels, autopct='%1.1f%%', startangle=140, colors=['skyblue', 'lightgreen'])
plt.title('Distribution of Binary Label')
plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular

# The legend reuses the wedge labels, so names and colors always agree.
plt.legend(title="Labels", loc="best")

plt.show()


In [None]:
# (number of fully duplicated rows, number of columns)
merged_df.loc[merged_df.duplicated()].shape

# **Feature Engineering**

In [None]:
# The raw timestamp is no longer needed once `hour` and `day_of_week` have been derived.
# errors='ignore' plus rebinding (instead of inplace=True) makes this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
merged_df = merged_df.drop(columns='date', errors='ignore')

In [None]:
# Summary statistics of the numeric columns that remain after dropping `date`.
merged_df.describe()

Feature Encoding and scaling

In [None]:
# Encode weekday names as integers (Monday=0 ... Sunday=6).
day_to_num = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}

# Values that are not weekday names (e.g. already-encoded integers) pass through unchanged,
# so re-running the cell does not turn the whole column into NaN as a plain dict map would.
merged_df['day_of_week'] = merged_df['day_of_week'].map(lambda day: day_to_num.get(day, day))

In [None]:
# Scale the numeric predictors with RobustScaler.
# The target `Occupancy` is deliberately NOT in this list: it is the label, and rescaling it
# could turn the 0/1 classes into arbitrary floats depending on the class balance.
features = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'hour']

# Work on a copy so merged_df keeps the unscaled values.
scaled_df = merged_df.copy()

# Initialize the RobustScaler
scaler = RobustScaler()

# NOTE(review): the scaler is fitted on all rows, before the train/test split, so test-set
# statistics influence the transform. Consider fitting on the training rows only.
scaled_df[features] = scaler.fit_transform(scaled_df[features])

In [None]:
# Pairwise correlations between all columns of the scaled frame, drawn as an annotated heatmap.
correlation_matrix = scaled_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


We use PCA for dimensionality reduction in order to avoid multicollinearity.

In [None]:
# Fit a full PCA on the predictors only. The label `Occupancy` is excluded: fitting PCA on a
# frame that contains the target would mix the label into the components (target leakage).
pca_features = scaled_df.drop(columns='Occupancy')

pca = PCA()
pca.fit(pca_features)

# Fraction of total variance explained by each component
explained_variance = pca.explained_variance_ratio_

# Cumulative explained variance vs. number of components kept
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_variance) + 1), explained_variance.cumsum(), marker='o', linestyle='--')
plt.title('Explained Variance by Different Principal Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

Using the elbow method, we can see that 3 is a good number of principal components for capturing most of the variance in the data.
Let's apply PCA to the data.

In [None]:
# Project the predictors onto the first three principal components.
pca = PCA(n_components=3)

# The label must not be part of the PCA input; otherwise the components handed to the
# classifiers would already encode the value they are supposed to predict.
pca_features = scaled_df.drop(columns='Occupancy')
principalComponents = pca.fit_transform(pca_features)

# One column per component: PC1, PC2, PC3
principalDf = pd.DataFrame(data=principalComponents, columns=['PC'+str(i+1) for i in range(pca.n_components_)])

# Add the label back positionally (rows are in the same order as scaled_df), so the
# assignment does not depend on the two frames sharing an index.
principalDf['Occupancy'] = scaled_df['Occupancy'].to_numpy()

In [None]:
# Preview the first five rows of the component frame.
principalDf.head(n=5)

# **Training**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Hold out 30% of the rows for evaluation.
X = principalDf.drop('Occupancy', axis=1)
y = principalDf['Occupancy']

# stratify=y keeps the occupied/unoccupied ratio the same in both partitions, which matters
# because the label is imbalanced; random_state makes the split reproducible.
# NOTE(review): rows are consecutive sensor readings, so a random split puts near-identical
# neighbours on both sides. A time-based split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
def _positive_class_scores(fitted_model, X):
    """Return a continuous score for the positive class of a fitted binary classifier.

    Uses predict_proba when the estimator exposes it (column 1 is the probability of the
    larger class label) and falls back to decision_function otherwise (e.g. SVC fitted
    without probability=True).
    """
    if hasattr(fitted_model, 'predict_proba'):
        return fitted_model.predict_proba(X)[:, 1]
    return fitted_model.decision_function(X)


# Candidate models, each with light regularization where the estimator supports it.
# random_state is fixed for the randomized ensembles so results are reproducible.
models = {
    'LogisticRegression': LogisticRegression(C=1.0, penalty='l2', solver='liblinear'),  # L2 regularization
    'RandomForest': RandomForestClassifier(max_depth=5, random_state=42),  # Limit the depth of trees
    'GradientBoosting': GradientBoostingClassifier(max_depth=3, random_state=42),  # Limit the depth of trees
    'SVM': SVC(C=1.0),  # Regularization parameter
    'KNN': KNeighborsClassifier(),  # KNN doesn't have regularization but consider reducing k
    'NaiveBayes': GaussianNB()  # Naive Bayes doesn't have regularization
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # ROC AUC is a ranking metric: it needs continuous scores. Feeding it hard 0/1
    # predictions collapses the curve to one operating point and understates the AUC.
    y_score = _positive_class_scores(model, X_test)

    print(f"{name} classification report:")
    print(classification_report(y_test, y_pred))
    print(f"{name} confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"{name} ROC AUC Score:")
    print(roc_auc_score(y_test, y_score))

    # Cross-validation on the training partition (the estimator is cloned per fold)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"{name} Cross-validation scores: {cv_scores}")

    # Learning curve: mean train / validation score as the training set grows
    train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train, cv=5)
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    fig = plt.figure()
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.title(f"Learning Curve for {name}")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()
    plt.close(fig)  # release the figure; one is created per model
    print("-----" * 10)

In [None]:
# Baseline: a default logistic regression on the PCA components.
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate on the held-out set
y_pred = model.predict(X_test)
# Probability of the positive class. ROC AUC must be computed from continuous scores,
# not from the thresholded predictions.
y_score = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_score))

In [None]:
# Preprocessing
# data splitting (train test split)
# prepare inputs(x) and outputs(y)
# Selecting and training model
# RandomForest (best starting candidate)
