In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the two labelled training sets and the unlabelled test set from the Kaggle input directory
training1_data = pd.read_csv("/kaggle/input/dataset/data/training1.csv")
training2_data = pd.read_csv("/kaggle/input/dataset/data/training2.csv")
test_data = pd.read_csv("/kaggle/input/dataset/data/test.csv")

# Columns that carry the target and its confidence rather than model inputs
non_feature_columns = ['label', 'confidence']

# training1: target, label confidence, then everything else as features
y_train1, confidence1 = training1_data['label'], training1_data['confidence']
X_train1 = training1_data.drop(columns=non_feature_columns)

# training2: same decomposition
y_train2, confidence2 = training2_data['label'], training2_data['confidence']
X_train2 = training2_data.drop(columns=non_feature_columns)

## Details of Dataset
The extracted files are:
- training1.csv: Contains 400 samples with no missing values.
- training2.csv: Contains 2750 samples with some missing values.
- test.csv: Contains 1000 samples with some missing values.



The training1.csv file contains:
- 400 samples (rows).
- 3458 columns, including:
    - 3072 CNN features (CNNs to CNNs.3071).
    - 384 GIST features (GIST.0 to GIST.383).
    - A label column indicating the class (1 for happy, 0 for sad).
    - A confidence column indicating the confidence of the label.
    - No missing values



The training2.csv file contains:
- 2750 samples (rows).
- 3458 columns, including:
     - 3072 CNN features (CNNs to CNNs.3071).
     - 384 GIST features (GIST.0 to GIST.383).
     - A label column indicating the class (1 for happy, 0 for sad).
     - A confidence column indicating the confidence of the label.
     - This dataset has missing values (NaNs) in the feature columns.


The test.csv file contains:
- 1000 samples (rows).
- 3456 columns, including:
     - 3072 CNN features (CNNs to CNNs.3071).
     - 384 GIST features (GIST.0 to GIST.383).
     - This dataset also has missing values (NaNs) in the feature columns.

In [None]:
# Preview the first rows of the training2 feature frame (NaN entries are visible in the output)
X_train2.head()

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.374,GIST.375,GIST.376,GIST.377,GIST.378,GIST.379,GIST.380,GIST.381,GIST.382,GIST.383
0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.33607,1.5884,...,0.00764,,0.036742,0.012381,,0.053308,0.026501,0.005391,0.001272,0.001446
1,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,2.2554,...,0.040871,0.02033,0.043143,0.019345,0.016736,0.008209,0.023059,,,0.022575
2,0.0,,0.0,0.080498,,0.0,0.0,,0.0,0.0,...,0.035165,0.027588,0.039189,0.02731,0.03801,0.003747,0.016547,,0.017964,0.034397
3,0.0,0.0,0.39567,0.0,0.0,0.0,0.0,,0.0,0.0,...,0.04951,0.027773,0.020592,0.044585,0.032217,0.054913,0.035068,0.021064,0.020542,0.033792
4,,,,0.037334,0.0,0.90437,1.17,0.40552,0.0,0.21256,...,0.003357,0.021205,0.003779,0.006411,,0.003991,0.012906,0.008374,0.00219,0.042025


In [None]:
# Summarise the training1 feature frame: row range, column count, dtypes and memory usage
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Columns: 3456 entries, CNNs to GIST.383
dtypes: float64(3456)
memory usage: 10.5 MB


In [None]:
# Preview the first rows of the test frame (features only; NaN entries are visible in the output)
test_data.head()

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.374,GIST.375,GIST.376,GIST.377,GIST.378,GIST.379,GIST.380,GIST.381,GIST.382,GIST.383
0,,0.2334,0.0,,0.79188,0.0,,0.0,0.439,0.0,...,0.009773,,0.011548,,0.017014,,0.020395,,0.007909,0.024576
1,0.45386,0.0,,,0.0,1.1775,0.0,0.0,0.42297,2.0251,...,0.00982,0.026096,0.039678,,0.057236,0.02344,,0.014737,0.01386,0.058389
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31986,...,,,0.026954,0.05049,,,,0.021365,0.027606,0.031131
3,0.22014,,0.0,,0.88192,1.0936,,0.0,0.0,0.0,...,0.007899,0.023398,,0.022786,,0.007288,0.043885,,0.011621,0.022733
4,,0.0,,,0.0,0.0,0.0,1.7938,0.0,0.0,...,0.012921,,0.019792,0.01901,0.003771,0.003214,0.001543,,0.003199,


In [None]:
# Positional column ranges of the two feature families
cnn_feature_range = range(3072)
gist_feature_range = range(3072, 3456)


def report_null_counts(features, dataset_name):
    """Print how many values are missing in the CNN and GIST columns of `features`.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame whose columns at positions `cnn_feature_range` and
        `gist_feature_range` hold the CNN and GIST features.
    dataset_name : str
        Heading printed above the counts.
    """
    cnn_block = features.iloc[:, cnn_feature_range]
    gist_block = features.iloc[:, gist_feature_range]

    cnn_null_count = cnn_block.isnull().sum().sum()
    gist_null_count = gist_block.isnull().sum().sum()

    value_count = cnn_block.size + gist_block.size
    null_count = cnn_null_count + gist_null_count
    # Guard against an empty frame so the percentage never divides by zero
    null_percentage = (null_count / value_count) * 100 if value_count else 0.0

    print(dataset_name)
    print(f"Total Data values: {value_count}")
    print(f"Total null values: {null_count}")
    print(f"Count of null values in CNN features: {cnn_null_count}")
    print(f"Count of null values in GIST features: {gist_null_count}")
    print(f"percentage of null values with total data values: {null_percentage:.2f}")


# Same report for the training2 features and for the test set, separated by a blank line
report_null_counts(X_train2, "Train Data")
print()
report_null_counts(test_data, "Test Data")

In [None]:
# Positional column ranges of the two feature families
cnn_feature_range = range(3072)
gist_feature_range = range(3072, 3456)

# Cell counts and missing-value counts per feature family (training2 features)
cnn_total_values = X_train2.iloc[:, cnn_feature_range].size
cnn_null_values = X_train2.iloc[:, cnn_feature_range].isnull().sum().sum()

gist_total_values = X_train2.iloc[:, gist_feature_range].size
gist_null_values = X_train2.iloc[:, gist_feature_range].isnull().sum().sum()

# Totals across both families
total_values = cnn_total_values + gist_total_values
total_null_values = cnn_null_values + gist_null_values

# The last bar is the number of populated cells (total minus nulls), so it is
# labelled as non-null values rather than as the total.
null_counts_df = pd.DataFrame({
    'Feature Type': ['CNN Features', 'GIST Features', 'Total Null Values', 'Non-Null Values'],
    'Values': [cnn_null_values, gist_null_values, total_null_values, total_values - total_null_values],
    'Total Values': [cnn_total_values, gist_total_values, total_values, total_values]
})

# Plot null counts next to the non-null count for scale
plt.figure(figsize=(12, 8))
plt.bar(null_counts_df['Feature Type'], null_counts_df['Values'], color=['blue', 'green', 'red', 'purple'])

plt.title('Null and Non-Null Value Counts in CNN and GIST Features (Training Data)')
plt.ylabel('Count of Values')
plt.xlabel('Feature Type')
plt.show()

In [None]:
# Overall number of missing entries across both feature families
# (cnn_null_values / gist_null_values come from the preceding cell)
total_null_counts_train = cnn_null_values + gist_null_values

# Tabulate the three counts so they can be plotted as one bar series
bar_labels = ['CNN Features', 'GIST Features', 'Total Null Values']
bar_heights = [cnn_null_values, gist_null_values, total_null_counts_train]
null_counts_df = pd.DataFrame({'Feature Type': bar_labels, 'Null Values': bar_heights})

# Bar chart of the missing-value counts
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    null_counts_df['Feature Type'],
    null_counts_df['Null Values'],
    color=['blue', 'green', 'red'],
)
ax.set_title('Null Values in CNN and GIST Features (Training Data)')
ax.set_ylabel('Count of Null Values')
ax.set_xlabel('Feature Type')
plt.show()

In [None]:
# Stack training1 on top of training2 (features, labels and label confidences).
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of training1 would be duplicated by the first rows of training2.
X_train_combined = pd.concat([X_train1, X_train2], axis=0, ignore_index=True)
y_train_combined = pd.concat([y_train1, y_train2], axis=0, ignore_index=True)
confidence_combined = pd.concat([confidence1, confidence2], axis=0, ignore_index=True)

# The three objects must stay row-aligned for training with sample weights
assert len(X_train_combined) == len(y_train_combined) == len(confidence_combined)

In [None]:
# Replace every NaN with the constant 0. The imputer is fitted on the combined
# training features and then applied unchanged to the test features.
imputer = SimpleImputer(fill_value=0, strategy='constant')
imputer.fit(X_train_combined)
X_train_imputed = imputer.transform(X_train_combined)
X_test_imputed = imputer.transform(test_data)

## Scaling Data

StandardScaler standardizes the features by removing the mean and scaling to unit variance. This means each feature will have a mean of 0 and a standard deviation of 1. This is achieved by:

$$
X_{\text{scaled}} = \frac{X - \mu}{\sigma}
$$
 
where 𝜇 is the mean of the feature and 𝜎 is the standard deviation.

In [None]:
# Standardise each feature to zero mean and unit variance. The statistics are
# learned from the imputed training matrix and reused for the test matrix.
scaler = StandardScaler()
scaler.fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Model Selection

- Evaluated three different models: 
    - Logistic Regression
    - Random Forest
    - Support Vector Machine (SVM).
- Cross-validation (with 5 folds) was used to assess the performance of each model.
- For each model, the mean accuracy across all folds was calculated.
- Plotted the accuracy for each fold and the mean accuracy for visual comparison.
- This approach helped identify the model with the best overall performance.

In [None]:
# Hold-out split of the combined training data. It is not used by the
# cross-validation below, which runs on the full combined set.
X_train, X_val, y_train, y_val, confidence_train, confidence_val = train_test_split(
    X_train_scaled, y_train_combined, confidence_combined, test_size=0.2, random_state=42)

# Candidate models. The forest is seeded so that re-running the cell gives the
# same scores, matching the random_state=42 used by the other cells.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC()
}

# Number of cross-validation folds, shared by the scoring call and the plots
n_folds = 5
fold_positions = range(1, n_folds + 1)
tick_labels = [f'Fold {k}' for k in fold_positions] + ['Mean']

# Evaluate classifiers using cross-validation with sample weights
# NOTE(review): newer scikit-learn releases rename `fit_params` to `params` —
# confirm against the installed version.
results = {}
for clf_name, clf in classifiers.items():
    scores = cross_val_score(clf, X_train_scaled, y_train_combined, cv=n_folds, scoring='accuracy', fit_params={'sample_weight': confidence_combined})
    results[clf_name] = scores
    mean_score = np.mean(scores)

    # Per-fold accuracies plus the mean as an extra point and a dashed reference line
    title = 'Cross-Validation Accuracy Scores of '+clf_name
    plt.figure(figsize=(10, 6))
    plt.scatter(fold_positions, scores, color='blue', label='Fold Accuracy')
    plt.scatter([n_folds + 1], [mean_score], color='red', label='Mean Accuracy')
    plt.axhline(mean_score, color='red', linestyle='--')
    plt.title(title)
    plt.xlabel('Fold')
    plt.ylabel('Accuracy')
    plt.xticks(range(1, n_folds + 2), tick_labels)
    plt.legend()
    plt.grid(True)
    plt.show()
    print(f"scores: {scores}\n {clf_name} Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# One column of fold scores per model
results_df = pd.DataFrame(results)

# Box plot comparing the fold-score distributions of the models
plt.figure(figsize=(10, 6))
sns.boxplot(data=results_df, palette="Set2")
plt.title('Comparison of Model Performance')
plt.ylabel('Accuracy')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.show()

- **Cross-Validation Analysis**
     - Tried 5 different numbers of cross-validation folds (cv = 3 to 7) for each model.
     - Recorded the mean accuracy for each CV value.
     - Plotted a line graph for each model with mean accuracy values as markers on the lines to visualize performance trends across different CV values.

In [None]:

# Hold out 20% of the combined training data; the sweep below uses the 80% part
X_train, X_val, y_train, y_val, confidence_train, confidence_val = train_test_split(
    X_train_scaled, y_train_combined, confidence_combined, test_size=0.2, random_state=42)

# Candidate models (max_iter raised for Logistic Regression). The forest is
# seeded so that the sweep is reproducible.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
}

# Fold counts to try: 3, 4, 5, 6 and 7
cv_values = list(range(3, 8))

# Mean cross-validation accuracy per model, one entry per fold count
mean_accuracies = {name: [] for name in classifiers}

for name, clf in classifiers.items():
    for n_folds in cv_values:
        scores = cross_val_score(clf, X_train, y_train, cv=n_folds, scoring='accuracy')
        mean_accuracies[name].append(np.mean(scores))

# Plot mean accuracy against the fold count that produced it, so the x-axis
# shows the quantity that was actually varied.
plt.figure(figsize=(12, 8))
for name, accuracies in mean_accuracies.items():
    plt.plot(cv_values, accuracies, marker='o', label=name)

plt.title('Cross-Validation Mean Accuracy Scores for Different Models')
plt.xlabel('Number of CV Folds')
plt.ylabel('Mean Accuracy')
plt.xticks(cv_values)
plt.legend()
plt.grid(True)
plt.show()

# Display the mean accuracies
for name, accuracies in mean_accuracies.items():
    print(f"{name}: {accuracies}")


- **Random Forest:** 
    - Evaluated with different numbers of trees (10, 50, 100, 150, 200) using 5-fold cross-validation, showing improved performance with more trees and providing valuable feature importance insights.

In [None]:
# Forest sizes to compare
n_estimators_range = [10, 50, 100, 150, 200]

# train_scores holds the mean 5-fold cross-validation accuracy on the training split;
# test_scores holds the accuracy on the held-out validation split.
train_scores = []
test_scores = []

# Evaluate Random Forest with different numbers of trees
# NOTE(review): newer scikit-learn releases rename `fit_params` to `params` —
# confirm against the installed version.
for n in n_estimators_range:
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    train_score = np.mean(cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy', fit_params={'sample_weight': confidence_train}))
    test_score = rf.fit(X_train, y_train, sample_weight=confidence_train).score(X_val, y_val)
    train_scores.append(train_score)
    test_scores.append(test_score)

# Plot both curves. The first is a cross-validation estimate, not the accuracy
# on the data the model was fitted to, so it is labelled accordingly.
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_scores, label="Mean 5-fold CV score", marker='o')
plt.plot(n_estimators_range, test_scores, label="Validation score", marker='o')
plt.title('Random Forest Performance with Different Numbers of Trees')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()

## Imputation and Model Training with Different Combinations

- Function Definition:
    - Training Data 2 is used for training, and evaluation is done on Training Data 1.
    - The `train_and_evaluate` function handles the imputation, scaling, splitting data, model training, and evaluation.
    - The function parameters allow you to specify the imputation strategy, the fill value, and whether to use sample weights.
        - Imputation:
            - **Mean Imputation:** Missing values are replaced with the mean of each column.
            - **Constant Value Imputation:** Missing values are replaced with a specified constant value (e.g., 0).
        - Scaling:
            - The data is scaled to have a mean of 0 and a standard deviation of 1 using StandardScaler.
        - Training and Evaluation:

            - **Model Training:** An SVM model is trained with and without sample weights.
            - **Prediction and Evaluation:** The model is used to predict on the validation data, and the classification report is printed.
- Scenario Execution:

    - The code defines a list of scenarios to be executed.
    - For each scenario, the `train_and_evaluate` function is called with the appropriate parameters, and the results are printed.

In [None]:
def train_and_evaluate(strategy, fill_value=None, use_sample_weight=True):
    """Train an SVM on training2 and print its classification report on training1.

    Reads the notebook-level `X_train2`, `y_train2`, `confidence2` (training
    data) and `X_train1`, `y_train1` (evaluation data).

    Parameters
    ----------
    strategy : str
        Imputation strategy passed to `SimpleImputer`.
    fill_value : scalar, optional
        Replacement value; only passed on when `strategy == 'constant'`.
    use_sample_weight : bool, default True
        If True, fit the SVM with `confidence2` as per-sample weights.
    """
    if strategy == 'constant':
        imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    else:
        imputer = SimpleImputer(strategy=strategy)

    # Fit the imputer on training2 only, then apply it to the evaluation set
    X_fit = imputer.fit_transform(X_train2)
    X_eval = imputer.transform(X_train1)

    # Same for the scaler: statistics come from training2 only
    scaler = StandardScaler()
    X_fit = scaler.fit_transform(X_fit)
    X_eval = scaler.transform(X_eval)

    # Train the model, optionally weighting each sample by its label confidence
    best_clf = SVC()
    if use_sample_weight:
        best_clf.fit(X_fit, y_train2, sample_weight=confidence2)
    else:
        best_clf.fit(X_fit, y_train2)

    # Predict on training1 and report against its labels
    val_predictions = best_clf.predict(X_eval)
    print(f"Strategy: {strategy}, Fill Value: {fill_value}, Use Sample Weight: {use_sample_weight}")
    print(classification_report(y_train1, val_predictions),"\n")

# Build every (imputation strategy, fill value, use sample weights) combination:
# mean and constant-zero imputation, each with and without sample weights.
scenarios = [
    (imputation, constant, weighted)
    for imputation, constant in (('mean', None), ('constant', 0))
    for weighted in (True, False)
]

# Run each scenario in turn; every call prints its own classification report
for strategy, fill_value, use_sample_weight in scenarios:
    train_and_evaluate(
        strategy,
        fill_value=fill_value,
        use_sample_weight=use_sample_weight,
    )

## Feature Importance
- The dataset includes two types of features: CNN features (3072 features) and GIST features (384 features).
- We used the Random Forest classifier to analyze feature importance because it provides inherent feature importance scores.
- Random Forests are ensemble methods that build multiple decision trees during training and combine the predictions of the individual trees (for classification, by aggregating the trees' class predictions). They are capable of ranking the importance of features.

- **Procedure:**

    - Train the Random Forest model on the combined dataset.
    - Extract feature importance scores from the trained model.
    - Aggregate the importance scores for CNN features and GIST features separately.

In [None]:
# Hold out 20% of the combined, scaled training data
X_train, X_val, y_train, y_val, confidence_train, confidence_val = train_test_split(
    X_train_scaled,
    y_train_combined,
    confidence_combined,
    test_size=0.2,
    random_state=42,
)

# Fit a 300-tree forest on the training part, weighting samples by label confidence
rf = RandomForestClassifier(n_estimators=300,random_state=42)
rf.fit(X_train, y_train, sample_weight=confidence_train)

# Per-feature importance scores of the fitted forest
feature_importances = rf.feature_importances_

# Column positions of the CNN block and of the GIST block
cnn_feature_range = range(3072)
gist_feature_range = range(3072, 3456)

# Total importance carried by each feature family
cnn_importance = np.sum(feature_importances[cnn_feature_range])
gist_importance = np.sum(feature_importances[gist_feature_range])

# Report the two totals
print(f"Aggregated feature importance for CNN features: {cnn_importance}")
print(f"Aggregated feature importance for GIST features: {gist_importance}")

# Bar chart of the two totals
family_names = ['CNN Features', 'GIST Features']
family_totals = [cnn_importance, gist_importance]
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(family_names, family_totals, color=['blue', 'green'])
ax.set_title('Aggregated Feature Importance for CNN and GIST Features')
ax.set_ylabel('Importance')
ax.set_xlabel('Feature Type')
plt.show()

# Model Training for prediction

- Best Performing Model:
    - After trying out different models with various parameter combinations, SVM emerged as the best-performing model.
    
- Handling Missing Data:
     - Missing data was imputed using a constant value of 0.
     
- Use of Sample Weights:
    - Sample weights were utilized during model training to account for the confidence levels in the training labels.
    
    
In summary, the SVM model with constant value imputation for missing data and the use of sample weights provided the best performance.


In [None]:
# Impute missing values with the constant 0; the imputer is fitted on the combined training features
imputer = SimpleImputer(strategy='constant',fill_value=0)
X_train_imputed = imputer.fit_transform(X_train_combined)
X_test_imputed = imputer.transform(test_data)

# Rescale the data with statistics learned from the training features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Hold out 10% of the training data to check the model before predicting the test set
X_train, X_val, y_train, y_val, confidence_train, confidence_val = train_test_split(
    X_train_scaled, y_train_combined, confidence_combined, test_size=0.1, random_state=42)

# Fit the final model on the 90% training split, weighting samples by label confidence.
# NOTE(review): the summary above names SVM as the best model, yet this cell trains a
# RandomForestClassifier — confirm which estimator is intended for the submission.
best_clf = RandomForestClassifier(random_state=42)
best_clf.fit(X_train, y_train, sample_weight=confidence_train)

# Evaluate on the held-out validation split
predictions = best_clf.predict(X_val)
print(classification_report(y_val, predictions))

# Predict the test set from the scaled features. The model was fitted on scaled
# data, so it must be given test features that went through the same scaler.
test_predictions = best_clf.predict(X_test_scaled)
predictions_df = pd.DataFrame({'prediction': test_predictions})
predictions_df.to_csv('predictions.csv', index=False)