In [2]:
import arff
import csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [3]:

def convert_arff_to_csv(arff_path, csv_path):
    """
    Convert an ARFF file to a CSV file.

    The ARFF file is parsed with ``arff.load``. The first element of every
    entry in the parsed ``'attributes'`` list becomes a CSV header cell, and
    the parsed ``'data'`` rows are written underneath it.

    Failures are reported with ``print`` instead of being raised, so the
    notebook keeps running; the function always returns ``None``.

    Args:
        arff_path: Path of the ARFF file to read (opened as UTF-8 text).
        csv_path: Path of the CSV file to create or overwrite (UTF-8).
    """
    # Stage 1: read and parse the input. A FileNotFoundError raised here can
    # only refer to `arff_path`, so the "input file not found" message is
    # accurate.
    try:
        with open(arff_path, 'r', encoding='utf-8') as f:
            arff_data = arff.load(f)
    except FileNotFoundError:
        print(f"❌ eror : The input file at '{arff_path}' was not found.")
        return
    except Exception as e:
        print(f"❌eror not found: {e}")
        return

    # Stage 2: write the output. This is handled separately so that a
    # FileNotFoundError caused by `csv_path` (e.g. a missing output directory)
    # is no longer misreported as a missing *input* file; the generic message
    # carries the real OS error, which names the offending path.
    try:
        column_headers = [attr[0] for attr in arff_data['attributes']]
        data_rows = arff_data['data']

        # newline='' lets the csv module control line endings itself.
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(column_headers)
            writer.writerows(data_rows)

        print(f"conerted successfully. File saved at:\n{csv_path}")
    except Exception as e:
        print(f"❌eror not found: {e}")


# --- Example usage ---
# Set `input_arff_file` to the ARFF file that should be read and
# `output_csv_file` to the CSV file that should be written, then run the
# conversion. The function reports success or failure by printing a message.
input_arff_file = 'your_dataset.arff'
output_csv_file = 'converted_dataset.csv'
convert_arff_to_csv(arff_path=input_arff_file, csv_path=output_csv_file)

❌ eror : The input file at 'your_dataset.arff' was not found.


In [4]:

def clean_dataframe(csv_input_path, csv_output_path):
    """
    Read a CSV file, clean it, and save the result as a new CSV file.

    Steps:
    1. Treat '?' cells as missing and remove every row with a missing value.
    2. Remove columns whose values duplicate an earlier column (first is kept).
    3. Remove rows that duplicate an earlier row (first is kept).
    4. Save the cleaned DataFrame to ``csv_output_path`` without the index.

    Progress and failures are reported with ``print``; nothing is raised and
    the function returns ``None``.

    Args:
        csv_input_path: Path of the CSV file to read.
        csv_output_path: Path of the cleaned CSV file to write.
    """
    try:
        df = pd.read_csv(csv_input_path)

        rows_before = df.shape[0]
        df = df.replace('?', np.nan).dropna()
        print(f"{rows_before - df.shape[0]} rows with '?' were removed.")
        print("---")

        cols_before = df.shape[1]
        # Duplicate columns are detected on the transpose, but the survivors are
        # selected from the ORIGINAL frame. Round-tripping the data itself
        # through `.T` coerces every column to one common dtype (e.g. integer
        # columns become floats and are then saved as "1.0").
        # An empty frame is skipped: `duplicated()` returns a zero-length
        # result for it, which could not be used as a column mask.
        if not df.empty:
            is_repeated_column = df.T.duplicated().to_numpy()
            df = df.loc[:, ~is_repeated_column]
        print(f"{cols_before - df.shape[1]} columns were removed.")
        print("---")

        rows_before = df.shape[0]
        df = df.drop_duplicates()
        print(f"{rows_before - df.shape[0]} rows were removed.")
        print("---")

        df.to_csv(csv_output_path, index=False)
        print(f"✅ Cleaned file saved at:\n{csv_output_path}")

    except FileNotFoundError:
        print(f"❌ eror : The input file at '{csv_input_path}' was not found.")
    except Exception as e:
        print(f"❌ eror not found: {e}")


# --- Example usage ---
# Set `input_file` to the CSV file that should be cleaned and `output_file`
# to the path where the cleaned CSV should be written, then run the cleaning
# step. The function prints how many rows/columns it removed.
input_file = 'converted_dataset.csv'
output_file = 'preprocessed_dataset.csv'
clean_dataframe(csv_input_path=input_file, csv_output_path=output_file)

❌ eror : The input file at 'converted_dataset.csv' was not found.


In [5]:



def encode_categorical_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Encode the object-dtype columns of a DataFrame and return the result.

    - Columns with exactly 2 unique values are replaced in place (same column
      name and position) by their ``LabelEncoder`` codes.
    - Columns with more than 2 unique values are removed and replaced by
      ``OneHotEncoder`` indicator columns named ``"<column>_<category>"``,
      appended at the end of the frame. The first category is dropped.
    - Columns with fewer than 2 unique values are left untouched.

    The input DataFrame is not modified; a copy is encoded and returned.
    """
    result = df.copy()
    object_columns = result.select_dtypes(include=['object']).columns.tolist()

    binary_encoder = LabelEncoder()
    multiclass_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

    print("شروع فرآیند انکودینگ...")
    for name in object_columns:
        distinct_values = result[name].nunique()

        # Nothing to encode for constant or empty columns.
        if distinct_values < 2:
            continue

        if distinct_values == 2:
            print(f"column '{name}' is encoded with LabelEncoder.")
            result[name] = binary_encoder.fit_transform(result[name])
            continue

        print(f" column '{name}' is encoded with OneHotEncoder.")
        indicator_values = multiclass_encoder.fit_transform(result[[name]])
        # drop='first' omits the first category, so the indicator columns
        # correspond to the remaining categories, in order.
        kept_categories = multiclass_encoder.categories_[0][1:]
        indicators = pd.DataFrame(
            indicator_values,
            index=result.index,
            columns=[f"{name}_{category}" for category in kept_categories],
        )
        result = pd.concat([result, indicators], axis=1).drop(columns=name)

    print("encoding process completed.")
    return result






# Paths for the encoding step: the CSV to read and the CSV to write.
input_csv_path = "cleaned_autism_data.csv"
output_csv_path = 'encoded_output_file.csv'

# Load the CSV, encode its categorical columns, and save the result.
# Any failure is printed rather than raised so the notebook keeps running.
try:
    original_data = pd.read_csv(input_csv_path)
    print(f"file '{input_csv_path}' read successfully.")
    print("-" * 30)

    encoded_data = encode_categorical_features(original_data)

    encoded_data.to_csv(output_csv_path, index=False)
    print(f" ✅ Encoded file saved at:\n{output_csv_path}")
except Exception as error:
    # A missing input file gets its own message; everything else is reported
    # with the exception text.
    if isinstance(error, FileNotFoundError):
        print(f" ❌ eror : The input file at '{input_csv_path}' was not found.")
    else:
        print(f" ❌ eror not found: {error}")


 ❌ eror : The input file at 'cleaned_autism_data.csv' was not found.
