In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [2]:
# Directory settings shared by the cells below.
# `output_dir` is the root the feature CSVs are read from and written to;
# `data_dir` is not referenced in the visible cells.
data_dir = "cancerSeno_bw/test"
output_dir = "featuresTestFromTest"
classes = ["sano", "cancer"]

# Ensure every class has a "UnifiedFeatures" folder (no error if it already exists)
for cls in classes:
    unified_dir = os.path.join(output_dir, cls, "UnifiedFeatures")
    os.makedirs(unified_dir, exist_ok=True)

In [3]:
def read_csv_to_df(class_name, feature, base_dir=None):
    """Load the combined CSV of one feature for one class into a DataFrame.

    The file is read from
    ``<base_dir>/<class_name>/<feature>/combined_<feature>.csv``.

    Parameters
    ----------
    class_name : str
        Name of the class sub-directory.
    feature : str
        Feature name; used both as the sub-directory name and inside the
        ``combined_<feature>.csv`` file name.
    base_dir : str or path-like, optional
        Root directory that contains the per-class folders. When omitted,
        the notebook-level ``output_dir`` is used, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as parsed by ``pd.read_csv`` with default options.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the expected file is missing.
    """
    # Resolve the default at call time so the function follows the current
    # value of the global `output_dir` rather than a value frozen at def time.
    root = output_dir if base_dir is None else base_dir
    file_path = os.path.join(root, class_name, feature, f"combined_{feature}.csv")
    return pd.read_csv(file_path)

In [4]:
# One independent, initially empty dict per class; the feature dataframes
# are stored in it under their feature key.
data_frames = {label: dict() for label in classes}

In [5]:
# Key used inside `data_frames` -> folder / file stem the feature is stored under
feature_sources = {
    'pixel_density': "pixelDensity",
    'brightness': "brightness",
    'std_deviation': "stdDeviation",
}

# First pass: load every per-feature CSV for every class
for cls in classes:
    for key, folder in feature_sources.items():
        data_frames[cls][key] = read_csv_to_df(cls, folder)

# Second pass: put the three feature frames side by side (column-wise),
# name the resulting columns, and save one unified CSV per class
for cls in classes:
    per_feature = [data_frames[cls][key] for key in feature_sources]
    combined_df = pd.concat(per_feature, axis=1)
    # Assigning exactly three names raises if the concat produced any other
    # number of columns.
    combined_df.columns = list(feature_sources)
    target = os.path.join(output_dir, cls, "UnifiedFeatures", "combined_features.csv")
    combined_df.to_csv(target, index=False)

In [6]:
# Locations of the per-class unified feature tables
sano_csv = os.path.join(output_dir, "sano", "UnifiedFeatures", "combined_features.csv")
cancer_csv = os.path.join(output_dir, "cancer", "UnifiedFeatures", "combined_features.csv")

# Load each class and attach its label: 0 for "sano", 1 for "cancer"
combined_sano_df = pd.read_csv(sano_csv).assign(diagnosis=0)
combined_cancer_df = pd.read_csv(cancer_csv).assign(diagnosis=1)

# Stack both classes under a fresh 0..n-1 index, then discard rows with any NaN
final_df = pd.concat([combined_sano_df, combined_cancer_df], ignore_index=True).dropna()

# Persist the labelled table without writing the index column
final_csv = os.path.join(output_dir, "combined_features_test.csv")
final_df.to_csv(final_csv, index=False)

print("done")

done


In [8]:
# Re-read the labelled feature table from disk
final_csv_path = os.path.join(output_dir, "combined_features_test.csv")
final_df_test = pd.read_csv(final_csv_path)

In [9]:
# Preview the first five rows of the reloaded table
final_df_test.head(5)

Unnamed: 0,pixel_density,brightness,std_deviation,diagnosis
0,0.684636,167.8932,72.417016,0
1,0.109632,223.3596,50.889707,0
2,0.260716,171.3472,54.498876,0
3,0.193887,212.4244,57.584104,0
4,0.404494,172.6876,58.721597,0


In [11]:
# Preview the last five rows of the reloaded table
final_df_test.tail(5)

Unnamed: 0,pixel_density,brightness,std_deviation,diagnosis
47717,0.120072,197.166,49.932419,1
47718,0.47232,133.4136,27.181717,1
47719,1.206531,121.3584,18.580192,1
47720,0.310273,174.9348,55.545104,1
47721,0.308901,170.71,54.364072,1


In [12]:
# Total number of elements in the frame (rows times columns)
final_df_test.size

190888

In [13]:
# (number of rows, number of columns) of the reloaded table
final_df_test.shape

(47722, 4)