In [18]:
import opensmile
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


In [4]:

# Build the openSMILE extractor used by the rest of the notebook:
# ComParE 2016 feature set, extracted at the "Functionals" feature level.
smile_options = {
    'feature_set': opensmile.FeatureSet.ComParE_2016,
    'feature_level': opensmile.FeatureLevel.Functionals,
}
smile = opensmile.Smile(**smile_options)

In [10]:
# Root folder of the dataset: every sub-directory name is stored in the
# 'subfolder' column for the audio files it contains.
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
main_directory = '/Users/jameswang/workspace/genre recognition project/Data/genres_original'

# One DataFrame per successfully processed audio file
features_list = []
# Paths that could not be processed, kept so they can be inspected after the loop
failed_files = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the resulting feature table reproducible across runs and machines.
for subdir in sorted(os.listdir(main_directory)):
    subfolder_path = os.path.join(main_directory, subdir)

    # Skip stray files sitting next to the sub-directories
    if not os.path.isdir(subfolder_path):
        continue

    for filename in sorted(os.listdir(subfolder_path)):
        if not filename.endswith('.wav'):  # or your preferred format
            continue

        file_path = os.path.join(subfolder_path, filename)

        try:
            # Extract features for the file and replace the index openSMILE
            # returns with a plain RangeIndex (no in-place mutation).
            features = smile.process_file(file_path).reset_index(drop=True)

            # 'filename' / 'subfolder' columns with the same length as features
            additional_info = pd.DataFrame({'filename': [filename]*len(features), 'subfolder': [subdir]*len(features)})

            # Put the identifying columns in front of the feature columns
            combined_features = pd.concat([additional_info, features], axis=1)

            features_list.append(combined_features)

        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort the
            # whole extraction run. The failure is reported and recorded.
            print(f"Error processing file {file_path}: {e}")
            failed_files.append(file_path)

# pd.concat() raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not features_list:
    raise ValueError(
        f"No features were extracted from any .wav file under {main_directory!r}"
    )

# Concatenate all per-file DataFrames into a single DataFrame
all_features = pd.concat(features_list, ignore_index=True)



In [11]:


# Write the extracted feature table to CSV, leaving the row index out of the file
features_csv_path = '/Users/jameswang/workspace/genre recognition project/features.csv'
all_features.to_csv(features_csv_path, index=False)

In [21]:
# Load the dataset once. `data1` stays untouched (it keeps 'filename' and the
# original 'subfolder' values) and is reused for the final export, so the CSV
# does not have to be parsed a second time.
file_path = '/Users/jameswang/workspace/genre recognition project/features.csv'  # Replace with your file path
data1 = pd.read_csv(file_path)

# Modelling frame: everything except the 'filename' column.
# drop() returns a new frame, so the edits below never touch `data1`.
data = data1.drop(columns=['filename'])

# Encode 'subfolder' (the target variable) when it is not already numeric.
# Checking "not numeric" rather than `dtype == 'object'` also covers pandas'
# dedicated string dtype, so `le` is defined whenever the labels are text.
if not pd.api.types.is_numeric_dtype(data['subfolder']):
    le = LabelEncoder()
    data['subfolder'] = le.fit_transform(data['subfolder'])

# Split the dataset into features (X) and target (y)
X = data.drop(columns=['subfolder'])
y = data['subfolder']

# Hold out 20% of the rows; the forest below is fitted on the training part only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Random Forest (seeded) whose feature importances drive the selection
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Rank every feature column by its importance, highest first
feature_importances = rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Keep the names of the top-ranked features
top_n_features = 500  # for example, selecting top 500 features
selected_features = feature_importance_df.head(top_n_features)['Feature']

# Create a new dataset with only the selected feature columns
reduced_data = data[selected_features]

# 'filename' and the original (un-encoded) 'subfolder' come from the raw frame
filename_subfolder = data1[['filename', 'subfolder']]

# Both pieces originate from the same CSV rows, so a column-wise concat lines up
combined_data = pd.concat([filename_subfolder, reduced_data.reset_index(drop=True)], axis=1)

# Save the combined dataset to a new CSV file (relative to the working directory)
combined_data.to_csv('combined_dataset.csv', index=False)


