In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory that holds the three CSV files. Kept in one place so the notebook
# can be pointed at a different location by editing a single line.
DATA_DIR = "/kaggle/input/dataset/data"


def split_features_target(frame):
    """Split a labelled training frame into its three parts.

    Parameters
    ----------
    frame : pandas.DataFrame
        Training data containing a 'label' column, a 'confidence' column and
        the feature columns.

    Returns
    -------
    tuple
        (features, label, confidence) where `features` is `frame` without the
        'label' and 'confidence' columns and the other two are those columns.

    Raises
    ------
    KeyError
        If 'label' or 'confidence' is missing from `frame`.
    """
    features = frame.drop(columns=['label', 'confidence'])
    return features, frame['label'], frame['confidence']


# Load the data
training1_data = pd.read_csv(f"{DATA_DIR}/training1.csv")
training2_data = pd.read_csv(f"{DATA_DIR}/training2.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")

# Separate features, target, and confidence in training data
X_train1, y_train1, confidence1 = split_features_target(training1_data)
X_train2, y_train2, confidence2 = split_features_target(training2_data)

## Details of Dataset
The extracted files are:
- training1.csv: Contains 400 samples with no missing values.
- training2.csv: Contains 2750 samples with some missing values.
- test.csv: Contains 1000 samples with some missing values.



The training1.csv file contains:
- 400 samples (rows).
- 3458 columns, including:
    - 3072 CNN features (CNNs to CNNs.3071).
    - 384 GIST features (GIST.0 to GIST.383).
    - A label column indicating the class (1 for happy, 0 for sad).
    - A confidence column indicating the confidence of the label.
    - This dataset has no missing values (NaNs).



The training2.csv file contains:
- 2750 samples (rows).
- 3458 columns, including:
     - 3072 CNN features (CNNs to CNNs.3071).
     - 384 GIST features (GIST.0 to GIST.383).
     - A label column indicating the class (1 for happy, 0 for sad).
     - A confidence column indicating the confidence of the label.
     - This dataset has missing values (NaNs) in the feature columns.


The test.csv file contains:
- 1000 samples (rows).
- 3456 columns, including:
     - 3072 CNN features (CNNs to CNNs.3071).
     - 384 GIST features (GIST.0 to GIST.383).
     - This dataset also has missing values (NaNs) in the feature columns.

In [None]:
# Preview the first rows of the training2 feature matrix (label and
# confidence columns already removed).
X_train2.head()

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.374,GIST.375,GIST.376,GIST.377,GIST.378,GIST.379,GIST.380,GIST.381,GIST.382,GIST.383
0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.33607,1.5884,...,0.00764,,0.036742,0.012381,,0.053308,0.026501,0.005391,0.001272,0.001446
1,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,2.2554,...,0.040871,0.02033,0.043143,0.019345,0.016736,0.008209,0.023059,,,0.022575
2,0.0,,0.0,0.080498,,0.0,0.0,,0.0,0.0,...,0.035165,0.027588,0.039189,0.02731,0.03801,0.003747,0.016547,,0.017964,0.034397
3,0.0,0.0,0.39567,0.0,0.0,0.0,0.0,,0.0,0.0,...,0.04951,0.027773,0.020592,0.044585,0.032217,0.054913,0.035068,0.021064,0.020542,0.033792
4,,,,0.037334,0.0,0.90437,1.17,0.40552,0.0,0.21256,...,0.003357,0.021205,0.003779,0.006411,,0.003991,0.012906,0.008374,0.00219,0.042025


In [None]:
# Print a structural summary of the training1 features: index range,
# column count, dtypes and memory usage.
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Columns: 3456 entries, CNNs to GIST.383
dtypes: float64(3456)
memory usage: 10.5 MB


In [None]:
# Preview the first rows of the test set as loaded from test.csv.
test_data.head()

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.374,GIST.375,GIST.376,GIST.377,GIST.378,GIST.379,GIST.380,GIST.381,GIST.382,GIST.383
0,,0.2334,0.0,,0.79188,0.0,,0.0,0.439,0.0,...,0.009773,,0.011548,,0.017014,,0.020395,,0.007909,0.024576
1,0.45386,0.0,,,0.0,1.1775,0.0,0.0,0.42297,2.0251,...,0.00982,0.026096,0.039678,,0.057236,0.02344,,0.014737,0.01386,0.058389
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31986,...,,,0.026954,0.05049,,,,0.021365,0.027606,0.031131
3,0.22014,,0.0,,0.88192,1.0936,,0.0,0.0,0.0,...,0.007899,0.023398,,0.022786,,0.007288,0.043885,,0.011621,0.022733
4,,0.0,,,0.0,0.0,0.0,1.7938,0.0,0.0,...,0.012921,,0.019792,0.01901,0.003771,0.003214,0.001543,,0.003199,


In [None]:
# Define the ranges for CNN and GIST features (positional column indices:
# the CNN columns come first, the GIST columns follow).
cnn_feature_range = range(3072)
gist_feature_range = range(3072, 3456)


def _null_percentage(null_count, value_count):
    """Return nulls as a percentage of all values (NaN when there are no values)."""
    return (null_count / value_count) * 100 if value_count else float("nan")


def summarize_nulls(features):
    """Count cells and missing cells in the CNN and GIST column groups.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame whose columns at positions `cnn_feature_range` and
        `gist_feature_range` are the CNN and GIST features respectively.

    Returns
    -------
    dict
        Totals, null counts and null percentages for the CNN group, the GIST
        group and both groups together.
    """
    # Slice each group once and reuse it for both the size and the null count.
    cnn_block = features.iloc[:, cnn_feature_range]
    gist_block = features.iloc[:, gist_feature_range]

    cnn_total = cnn_block.size
    cnn_nulls = cnn_block.isnull().sum().sum()
    gist_total = gist_block.size
    gist_nulls = gist_block.isnull().sum().sum()

    total = cnn_total + gist_total
    total_nulls = cnn_nulls + gist_nulls

    return {
        "cnn_total_values": cnn_total,
        "cnn_null_values": cnn_nulls,
        "gist_total_values": gist_total,
        "gist_null_values": gist_nulls,
        "total_values": total,
        "total_null_values": total_nulls,
        "cnn_null_percentage": _null_percentage(cnn_nulls, cnn_total),
        "gist_null_percentage": _null_percentage(gist_nulls, gist_total),
        "total_null_percentage": _null_percentage(total_nulls, total),
    }


def print_null_summary(title, summary, end="\n"):
    """Print a `summarize_nulls` result under `title`.

    `end` is passed to the final `print` so a caller can add a blank line
    after the report.
    """
    print(title)
    print(f"Total Data values: {summary['total_values']}")
    print(f"Total null values: {summary['total_null_values']}")
    print(f"Count of null values in CNN features: {summary['cnn_null_values']}")
    print(f"Count of null values in GIST features: {summary['gist_null_values']}")
    print(
        f"percentage of null values with total data values: {summary['total_null_percentage']:.2f}",
        end=end,
    )


train_null_summary = summarize_nulls(X_train2)
# The extra newline separates the train report from the test report.
print_null_summary("Train Data", train_null_summary, end="\n\n")

test_null_summary = summarize_nulls(test_data)
print_null_summary("Test Data", test_null_summary)

# Keep the flat module-level names available. As before, after this cell they
# hold the statistics of the test set (the last data set summarised).
cnn_total_values = test_null_summary["cnn_total_values"]
cnn_null_values = test_null_summary["cnn_null_values"]
gist_total_values = test_null_summary["gist_total_values"]
gist_null_values = test_null_summary["gist_null_values"]
total_values = test_null_summary["total_values"]
total_null_values = test_null_summary["total_null_values"]
cnn_null_percentage = test_null_summary["cnn_null_percentage"]
gist_null_percentage = test_null_summary["gist_null_percentage"]
total_null_percentage = test_null_summary["total_null_percentage"]

In [None]:
# Define the ranges for CNN and GIST features (positional column indices:
# the CNN columns come first, the GIST columns follow).
cnn_feature_range = range(3072)
gist_feature_range = range(3072, 3456)

# Recompute the counts from the training2 features here so this cell does not
# depend on whatever an earlier cell last stored in these names.
# Each feature group is sliced once and reused for both counts.
cnn_features_train = X_train2.iloc[:, cnn_feature_range]
gist_features_train = X_train2.iloc[:, gist_feature_range]

# Calculate total values and null values for CNN features
cnn_total_values = cnn_features_train.size
cnn_null_values = cnn_features_train.isnull().sum().sum()

# Calculate total values and null values for GIST features
gist_total_values = gist_features_train.size
gist_null_values = gist_features_train.isnull().sum().sum()

# Calculate total values and null values for all features
total_values = cnn_total_values + gist_total_values
total_null_values = cnn_null_values + gist_null_values

# Create a DataFrame for visualization.
# The fourth bar is the number of populated cells (total minus nulls), so it
# is labelled "Non-Null" rather than "Total".
null_counts_df = pd.DataFrame({
    'Feature Type': ['CNN Features', 'GIST Features', 'Total Null Values', 'Total Non-Null Values'],
    'Values': [cnn_null_values, gist_null_values, total_null_values, total_values - total_null_values],
    'Total Values': [cnn_total_values, gist_total_values, total_values, total_values]
})

# Plot the counts
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.bar(null_counts_df['Feature Type'], null_counts_df['Values'], color=['blue', 'green', 'red', 'purple'])

ax.set_title('Null and Non-Null Value Counts in CNN and GIST Features (Training Data)')
ax.set_ylabel('Count of Values')
ax.set_xlabel('Feature Type')
plt.show()

In [None]:
# Sum the missing-cell counts of both feature groups
total_null_counts_train = cnn_null_values + gist_null_values

# Tabulate one (label, count) record per bar
null_counts_df = pd.DataFrame(
    [
        ('CNN Features', cnn_null_values),
        ('GIST Features', gist_null_values),
        ('Total Null Values', total_null_counts_train),
    ],
    columns=['Feature Type', 'Null Values'],
)

# Draw the bar chart through the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    null_counts_df['Feature Type'],
    null_counts_df['Null Values'],
    color=['blue', 'green', 'red'],
)
ax.set_title('Null Values in CNN and GIST Features (Training Data)')
ax.set_ylabel('Count of Null Values')
ax.set_xlabel('Feature Type')
plt.show()

In [None]:
# Combine training datasets.
# ignore_index=True renumbers the stacked rows 0..n-1. Without it the row
# labels of the first set would be repeated by the second set, and any
# label-based lookup or alignment on the combined objects would match
# several rows at once. Row order is unchanged, so the three objects stay
# aligned with each other.
X_train_combined = pd.concat([X_train1, X_train2], axis=0, ignore_index=True)
y_train_combined = pd.concat([y_train1, y_train2], axis=0, ignore_index=True)
confidence_combined = pd.concat([confidence1, confidence2], axis=0, ignore_index=True)

In [None]:
# Replace every missing feature value with the constant 0.
# The imputer is fitted on the combined training features only and then
# applied unchanged to both the training and the test features.
imputer = SimpleImputer(strategy='constant', fill_value=0)
imputer.fit(X_train_combined)

X_train_imputed = imputer.transform(X_train_combined)
X_test_imputed = imputer.transform(test_data)