In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the labels from train_labels.csv
labels_df = pd.read_csv('train_labels.csv')

# Load the CSV files from the train folder
data_list = []
for filename in labels_df['filename']:
    file_path = f'train/{filename}'
    df = pd.read_csv(file_path)
    data_list.append((filename, df))

# Combine data into a single DataFrame
combined_df = pd.DataFrame(data_list, columns=['filename', 'data'])

# Merge with labels on 'filename'
merged_df = pd.merge(combined_df, labels_df, on='filename')

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(
    merged_df['data'], merged_df['class'], test_size=0.2, random_state=42
)

# Preprocess the data using TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
train_features = vectorizer.fit_transform(train_data.astype(str))
test_features = vectorizer.transform(test_data.astype(str))

# Build a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(train_features, train_labels)

# Evaluate the Naive Bayes classifier
nb_predictions = nb_classifier.predict(test_features)
print("Naive Bayes Classification Report:")
print(classification_report(test_labels, nb_predictions))
print("Naive Bayes Accuracy:", accuracy_score(test_labels, nb_predictions))

# Build a neural network classifier
encoder = LabelEncoder()
train_labels_encoded = encoder.fit_transform(train_labels)
num_classes = len(encoder.classes_)

model = Sequential([
    Dense(128, input_shape=(train_features.shape[1],), activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the neural network
model.fit(train_features.toarray(), train_labels_encoded, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the neural network classifier
nn_predictions = model.predict_classes(test_features.toarray())
print("Neural Network Classification Report:")
print(classification_report(test_labels_encoded, nn_predictions))
print("Neural Network Accuracy:", accuracy_score(test_labels_encoded, nn_predictions))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

       anger       0.25      0.07      0.11        14
     disgust       0.22      0.40      0.29        10
        fear       0.14      0.08      0.11        12
         joy       0.00      0.00      0.00        14
     sadness       0.00      0.00      0.00        14
    surprise       0.07      0.38      0.12         8

    accuracy                           0.12        72
   macro avg       0.11      0.15      0.10        72
weighted avg       0.11      0.12      0.09        72

Naive Bayes Accuracy: 0.125
Epoch 1/10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


AttributeError: 'Sequential' object has no attribute 'predict_classes'

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load Data
labels_df = pd.read_csv("train_labels.csv")
data_frames = []

for index, row in labels_df.iterrows():
    filename = row['filename']
    class_label = row['class']
    
    file_path = os.path.join("train", filename)
    csv_data = pd.read_csv(file_path)
    
    csv_data['class'] = class_label
    data_frames.append(csv_data)

# Concatenate all DataFrames into a single DataFrame
merged_data = pd.concat(data_frames, ignore_index=True)


# Step 2: Data Preprocessing
merged_data = merged_data.fillna(merged_data.mean())
# Perform any additional preprocessing steps as needed

# Step 3: Split Data
X = merged_data.drop(columns=['class'])
y = merged_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train a Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Step 5: Evaluate the Model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# Step 6: Prediction for New Data
# Assuming 'new_data.csv' is the new CSV file you want to predict
new_data = pd.read_csv("newdata.csv")
new_predictions = classifier.predict(new_data)

print("Predicted Emotion:", new_predictions)


In [None]:
from statistics import mode

most_frequent_class = mode(new_predictions)
print("Predicted Emotion (Most Frequent):", most_frequent_class)


NameError: name 'new_predictions' is not defined

In [8]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Step 1: Load Data
labels_df = pd.read_csv("train_labels.csv")
data_frames = []

for index, row in labels_df.iterrows():
    filename = row['filename']
    class_label = row['class']
    
    file_path = os.path.join("train", filename)
    csv_data = pd.read_csv(file_path)
    
    csv_data['class'] = class_label
    data_frames.append(csv_data)

# Concatenate all DataFrames into a single DataFrame
merged_data = pd.concat(data_frames, ignore_index=True)

# Step 2: Data Preprocessing
# Perform any additional preprocessing steps as needed
# For now, let's encode the 'class' column into numerical values
label_encoder = LabelEncoder()
merged_data['class'] = label_encoder.fit_transform(merged_data['class'])

# Split Data
X = merged_data.drop(columns=['class'])
y = merged_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preprocessed")
# Train a Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the Model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# # Prediction for New Data
# # Assuming 'new_data.csv' is the new CSV file you want to predict
# new_data = pd.read_csv("newdata.csv")
# # Encode the 'class' column in the new data
# new_data['class'] = label_encoder.transform(new_data['class'])
# new_predictions = classifier.predict(new_data.drop(columns=['class']))

# # Decode the numerical predictions back to original labels
# predicted_emotions = label_encoder.inverse_transform(new_predictions)

# print("Predicted Emotion:", predicted_emotions)


Data preprocessed
Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     54005
           1       0.98      0.98      0.98     54190
           2       0.99      0.99      0.99     54372
           3       1.00      1.00      1.00     53698
           4       1.00      1.00      1.00     53953
           5       1.00      1.00      1.00     53782

    accuracy                           0.99    324000
   macro avg       0.99      0.99      0.99    324000
weighted avg       0.99      0.99      0.99    324000



KeyError: 'class'

In [10]:
# Prediction for New Data
# Assuming 'new_data.csv' is the new CSV file you want to predict
new_data = pd.read_csv("newdata.csv")

# Ensure the new data has the same structure as the training data
new_data.fillna(0, inplace=True)  # Handle missing values, adjust this based on your data
new_data['class'] = classifier.predict(new_data)

# Decode the numerical predictions back to original labels
predicted_emotions = label_encoder.inverse_transform(new_data['class'])

print("Predicted Emotion:", predicted_emotions)

Predicted Emotion: ['sadness' 'sadness' 'sadness' ... 'fear' 'fear' 'fear']


In [12]:

import os
import pandas as pd
from statistics import mode

# Assuming 'model' is the trained classifier and 'merged_data' is the combined training data

# Specify the folder containing the CSV files for prediction
folder_path = "validation"

# Initialize lists to store filenames and predicted classes
file_names = []
predicted_classes = []

# Loop through each CSV file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        # Load the CSV file
        file_path = os.path.join(folder_path, file_name)
        csv_data = pd.read_csv(file_path)
        
        # Make predictions using the trained model
        predictions = classifier.predict(csv_data)
        
        # Get the most frequent class using the mode function
        most_frequent_class = mode(predictions)
        
        # Append filename and predicted class to the lists
        file_names.append(file_name)
        predicted_classes.append(most_frequent_class)

# Create a DataFrame for submission
submission_df = pd.DataFrame({'filename': file_names, 'predicted_class': predicted_classes})

# Save the DataFrame to a new CSV file (submission.csv)
submission_df.to_csv("submission2.csv", index=False)

In [15]:
import os
import pandas as pd
from statistics import mode
from sklearn.preprocessing import LabelEncoder

# Assuming 'classifier' is the trained classifier

# Specify the folder containing the CSV files for prediction
folder_path = "validation"

# Initialize lists to store filenames and predicted classes
file_names = []
predicted_classes = []

# Loop through each CSV file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        # Load the CSV file
        file_path = os.path.join(folder_path, file_name)
        try:
            csv_data = pd.read_csv(file_path)
            
            # Make predictions using the trained model
            predictions = classifier.predict(csv_data)
            
            # Get the most frequent class using the mode function
            most_frequent_class = mode(predictions)
            
            # Append filename and predicted class to the lists
            file_names.append(file_name)
            predicted_classes.append(most_frequent_class)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

# Reverse the encoded class numbers to original labels
# Assuming 'label_encoder' is the LabelEncoder used during training
original_labels = label_encoder.inverse_transform(predicted_classes)

# Create a DataFrame for submission
submission_df = pd.DataFrame({'filename': file_names, 'predicted_class': original_labels})

# Save the DataFrame to a new CSV file (submission.csv)
submission_df.to_csv("submission2.csv", index=False)


In [17]:
# Step 1: Load Data
labels_df = pd.read_csv("train_labels.csv")
data_frames = []

for index, row in labels_df.iterrows():
    filename = row['filename']
    class_label = row['class']
    
    file_path = os.path.join("train", filename)
    csv_data = pd.read_csv(file_path)
    
    csv_data['class'] = class_label
    data_frames.append(csv_data)

# Concatenate all DataFrames into a single DataFrame
merged_data = pd.concat(data_frames, ignore_index=True)

# Step 2: Data Preprocessing
# Perform any additional preprocessing steps as needed
# For now, let's encode the 'class' column into numerical values
label_encoder = LabelEncoder()
merged_data['class'] = label_encoder.fit_transform(merged_data['class'])
# merged_data.to_csv("merged.csv", index=False)
merged_data.head()

Unnamed: 0,Time,P3,C3,F3,Fz,F4,C4,P4,Cz,CM,...,O2,X3,X2,F7,F8,X1,A2,T6,T4,class
0,93.9033,-585.9,-17.4,-3031.5,414.6,-1579.2,978.6,391.5,-3493.8,-2889.9,...,-72.9,-11.4,-3.6,-873.0,-1057.2,-15.3,-285.0,73.5,-785.4,2
1,93.9067,-590.1,-17.4,-3037.8,418.2,-1574.1,975.9,398.4,-2869.2,-1630.2,...,-69.3,-10.8,-3.6,-874.5,-1046.4,-15.6,-269.1,88.5,-765.9,2
2,93.91,-589.5,-17.1,-3053.4,412.8,-1576.8,981.6,397.2,-2599.2,-741.0,...,-73.5,-11.1,-3.6,-875.1,-1063.2,-15.6,-288.0,78.0,-751.5,2
3,93.9133,-589.5,-17.4,-3048.0,418.8,-1573.2,981.3,392.4,-2857.8,-1482.0,...,-72.9,-10.5,-3.9,-870.9,-1053.9,-15.0,-289.8,77.4,-769.8,2
4,93.9167,-596.4,-17.1,-3044.1,414.6,-1574.7,990.3,396.0,-3071.1,-1704.3,...,-69.6,-10.8,-4.2,-870.9,-1046.7,-15.3,-291.0,89.1,-782.1,2


In [18]:
merged_data.shape

(1620000, 26)

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Step 1: Load Data
labels_df = pd.read_csv("train_labels.csv")
data_frames = []

for index, row in labels_df.iterrows():
    filename = row['filename']
    class_label = row['class']
    
    file_path = os.path.join("train", filename)
    csv_data = pd.read_csv(file_path)
    
    csv_data['class'] = class_label
    data_frames.append(csv_data)

# Concatenate all DataFrames into a single DataFrame
merged_data = pd.concat(data_frames, ignore_index=True)
class_labels = merged_data['class']
merged_data = merged_data.drop(columns=['class'])

# Remove 'Time' column
merged_data = merged_data.drop(columns=['Time'])

# Remove columns with constant values
merged_data = merged_data.loc[:, (merged_data != merged_data.iloc[0]).any()]

# Remove columns with variance less than or equal to 0.001
low_variance_cols = merged_data.var()[merged_data.var() <= 0.001].index
merged_data = merged_data.drop(columns=low_variance_cols)

# Step 2: Data Preprocessing
# Perform any additional preprocessing steps as needed
# For now, let's encode the 'class' column into numerical values
merged_data['class'] = class_labels

label_encoder = LabelEncoder()
merged_data['class'] = label_encoder.fit_transform(merged_data['class'])

# Split Data
X = merged_data.drop(columns=['class'])
y = merged_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preprocessed")

# Train a Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the Model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


Data preprocessed
Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     54005
           1       0.98      0.98      0.98     54190
           2       0.99      0.99      0.99     54372
           3       1.00      1.00      1.00     53698
           4       1.00      1.00      1.00     53953
           5       1.00      1.00      1.00     53782

    accuracy                           0.99    324000
   macro avg       0.99      0.99      0.99    324000
weighted avg       0.99      0.99      0.99    324000



In [5]:
import os
import pandas as pd
from statistics import mode
from sklearn.preprocessing import LabelEncoder

# Assuming 'classifier' is the trained classifier

# Specify the folder containing the CSV files for prediction
folder_path = "validation"

# Initialize lists to store filenames and predicted classes
file_names = []
predicted_classes = []

# Loop through each CSV file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        # Load the CSV file
        file_path = os.path.join(folder_path, file_name)
        try:
            csv_data = pd.read_csv(file_path)
            csv_data=csv_data.drop(columns=['Time'])
            # Make predictions using the trained model
            predictions = classifier.predict(csv_data)
            
            # Get the most frequent class using the mode function
            most_frequent_class = mode(predictions)
            
            # Append filename and predicted class to the lists
            file_names.append(file_name)
            predicted_classes.append(most_frequent_class)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

# Reverse the encoded class numbers to original labels
# Assuming 'label_encoder' is the LabelEncoder used during training
original_labels = label_encoder.inverse_transform(predicted_classes)

# Create a DataFrame for submission
submission_df = pd.DataFrame({'filename': file_names, 'predicted_class': original_labels})

# Save the DataFrame to a new CSV file (submission.csv)
submission_df.to_csv("submission3.csv", index=False)

In [25]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Step 1: Load Data
labels_df = pd.read_csv("train_labels.csv")
data_frames = []

for index, row in labels_df.iterrows():
    filename = row['filename']
    class_label = row['class']
    
    file_path = os.path.join("train", filename)
    csv_data = pd.read_csv(file_path)
    data_frames.append(csv_data)

# Concatenate all DataFrames into a single DataFrame
merged_data = pd.concat(data_frames, ignore_index=True)

# Remove 'class' column temporarily
class_labels = merged_data['class']
merged_data = merged_data.drop(columns=['class'])

# Remove 'Time' column
merged_data = merged_data.drop(columns=['Time'])

# Remove columns with constant values
merged_data = merged_data.loc[:, (merged_data != merged_data.iloc[0]).any()]

# Remove columns with variance less than or equal to 0.001
low_variance_cols = merged_data.var()[merged_data.var() <= 0.001].index
merged_data = merged_data.drop(columns=low_variance_cols)

# Encode the 'class' column into numerical values
label_encoder = LabelEncoder()
class_labels_encoded = label_encoder.fit_transform(class_labels)

# Add the 'class' column back
merged_data['class'] = class_labels_encoded

# Split Data
X = merged_data.drop(columns=['class'])
y = merged_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preprocessed")

# Train a Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the Model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


KeyError: 'class'