# Data mining Project

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Converting .txt to .csv

In [4]:
file_path = 'data.txt'

df = pd.read_csv(file_path, delimiter='\t', header=None)


columns = [
    "ID","Gender", "Age_Group", "Residence", "Education_Level", "Source_of_Income",
    "Marital_Status", "Smoked_Cigarettes", "Year_Diagnosed", "Surgical_Treatment",
    "Chemotherapy", "Radiotherapy", "Immunotherapy", "Molecular_targeted_Therapy",
    "Hospitalization_Number", "Time_to_Treatment", "Medical_Treatment_Need",
    "Emotional_Impact", "Travel_Impact", "Quality_of_Life", "Symptoms_exp_cough",
    "Symptoms_exp_Hoarseness","Symptoms_exp_Blood_cough","Symptoms_exp_chestpain",
    "Symptoms_exp_Shortness_of_breath","Symptoms_exp_weakness","Symptoms_exp_None",
    "Symptom_Frequency", "Symptom_Household_Impact", "Sleep_Issues", "Support_From_Close",
    "Dependency_Fear", "Health_Satisfaction", "Daily_Life_Impact_physical","Daily_Life_Impact_Psychological",
    "Daily_Life_Impact_proffesional","Daily_Life_Impact_family_life","Daily_Life_Impact_social_life",
    "Daily_Life_Impact_no_effect","Energy_Level", "Self_Care", "Daily_Activities_Difficulty",
    "Work_Readiness", "Support_Satisfaction", "Coping_Strategy", "Negative_Emotions"
]

df.columns = columns
print(df)

df.to_csv('processed_survey_data_columns.csv', index=False)

      ID  Gender  Age_Group  Residence  Education_Level  Source_of_Income  \
0      1       1          1          2                4                 1   
1      2       1          2          4                3                 1   
2      3       1          2          2                4                 1   
3      4       2          4          4                3                 3   
4      5       2          3          1                2                 1   
..   ...     ...        ...        ...              ...               ...   
295  296       1          2          4                3                 1   
296  297       2          3          3                2                 1   
297  298       1          3          2                2                 1   
298  299       2          3          4                3                 1   
299  300       1          2          3                4                 4   

     Marital_Status  Smoked_Cigarettes  Year_Diagnosed  Surgical_Treatment 

# Data cleaning

In [None]:
# 1. Remove duplicate rows
df = df.drop_duplicates()
print(f"\nData after removing duplicates: {df.shape}")

# 2. Handle missing values
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Fill missing categorical values with mode
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Unknown'))
print(f"\nData after handling missing values:\n{df.isnull().sum()}")

# 3. Drop irrelevant or redundant columns  
if 'ID' in df.columns:
    df = df.drop(columns=['ID'])

# 4. Standardize column names  
df.columns = [col.strip().replace(' ', '_').lower() for col in df.columns]


Data after removing duplicates: (300, 46)

Data after handling missing values:
ID                                  0
Gender                              0
Age_Group                           0
Residence                           0
Education_Level                     0
Source_of_Income                    0
Marital_Status                      0
Smoked_Cigarettes                   0
Year_Diagnosed                      0
Surgical_Treatment                  0
Chemotherapy                        0
Radiotherapy                        0
Immunotherapy                       0
Molecular_targeted_Therapy          0
Hospitalization_Number              0
Time_to_Treatment                   0
Medical_Treatment_Need              0
Emotional_Impact                    0
Travel_Impact                       0
Quality_of_Life                     0
Symptoms_exp_cough                  0
Symptoms_exp_Hoarseness             0
Symptoms_exp_Blood_cough            0
Symptoms_exp_chestpain              0
Symptoms

In [7]:
print(df.head())

   gender  age_group  residence  education_level  source_of_income  \
0       1          1          2                4                 1   
1       1          2          4                3                 1   
2       1          2          2                4                 1   
3       2          4          4                3                 3   
4       2          3          1                2                 1   

   marital_status  smoked_cigarettes  year_diagnosed  surgical_treatment  \
0               1                  1            2021                   0   
1               2                  1            2020                   1   
2               2                  1            2020                   1   
3               2                  1            2021                   1   
4               2                  1            2019                   1   

   chemotherapy  ...  daily_life_impact_family_life  \
0             1  ...                              0   
1           

In [None]:
# Checking for Imbalances
imbalances = {}

for col in categorical_columns:
    counts = df[col].value_counts(normalize=True)
    if any(counts < 0.1):  
        imbalances[col] = counts

for col, counts in imbalances.items():
    print(f"\nColumn: {col}")
    print(counts)

No Imbalance in the dataset

In [15]:
print(df)

     gender  age_group  residence  education_level  source_of_income  \
0         1          1          2                4                 1   
1         1          2          4                3                 1   
2         1          2          2                4                 1   
3         2          4          4                3                 3   
4         2          3          1                2                 1   
..      ...        ...        ...              ...               ...   
295       1          2          4                3                 1   
296       2          3          3                2                 1   
297       1          3          2                2                 1   
298       2          3          4                3                 1   
299       1          2          3                4                 4   

     marital_status  smoked_cigarettes  year_diagnosed  surgical_treatment  \
0                 1                  1            2021   