# Data mining Project

In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Converting .txt to .csv

In [95]:
import pandas as pd

# Read the data from the file
file_path = 'data.txt'
df = pd.read_csv(file_path, delimiter='\t', header=None)

# Assign column names to the DataFrame
columns = [
    "ID", "Gender", "Age_Group", "Residence", "Education_Level", "Source_of_Income",
    "Marital_Status", "Smoked_Cigarettes", "Year_Diagnosed", "Surgical_Treatment",
    "Chemotherapy", "Radiotherapy", "Immunotherapy", "Molecular_targeted_Therapy",
    "Hospitalization_Number", "Time_to_Treatment", "Medical_Treatment_Need",
    "Emotional_Impact", "Travel_Impact", "Quality_of_Life", "Symptoms_exp_cough",
    "Symptoms_exp_Hoarseness", "Symptoms_exp_Blood_cough", "Symptoms_exp_chestpain",
    "Symptoms_exp_Shortness_of_breath", "Symptoms_exp_weakness", "Symptoms_exp_None",
    "Symptom_Frequency", "Symptom_Household_Impact", "Sleep_Issues", "Support_From_Close",
    "Dependency_Fear", "Health_Satisfaction", "Daily_Life_Impact_physical", "Daily_Life_Impact_Psychological",
    "Daily_Life_Impact_proffesional", "Daily_Life_Impact_family_life", "Daily_Life_Impact_social_life",
    "Daily_Life_Impact_no_effect", "Energy_Level", "Self_Care", "Daily_Activities_Difficulty",
    "Work_Readiness", "Support_Satisfaction", "Coping_Strategy", "Negative_Emotions"
]
df.columns = columns

# List of single-choice categorical variables to be one-hot encoded
single_choice_vars = [
    'Gender', 'Age_Group', 'Residence', 'Education_Level', 'Source_of_Income', 'Marital_Status',
    'Hospitalization_Number', 'Time_to_Treatment', 'Medical_Treatment_Need', 'Quality_of_Life',
    'Symptom_Frequency', 'Symptom_Household_Impact', 'Sleep_Issues', 'Dependency_Fear',
    'Energy_Level', 'Self_Care', 'Daily_Activities_Difficulty', 'Work_Readiness',
    'Support_Satisfaction', 'Coping_Strategy', 'Negative_Emotions'
]

# Mapping of variables to their options for meaningful labels
options_dict = {
    'Gender': {1: 'Female', 2: 'Male', 3: 'Nonbinary', 4: 'Prefer_not_to_say'},
    'Age_Group': {1: 'Below_30', 2: '30_45', 3: '46_60', 4: 'Above_60'},
    'Residence': {1: 'Village', 2: 'Town_upto_100k', 3: 'Town_100k_500k', 4: 'City_over_500k'},
    'Education_Level': {1: 'Primary', 2: 'Vocational', 3: 'Secondary', 4: 'Higher'},
    'Source_of_Income': {1: 'Employment', 2: 'Pension', 3: 'Retirement', 4: 'Other'},
    'Marital_Status': {1: 'Single', 2: 'Married', 3: 'Divorced', 4: 'Widowed'},
    'Hospitalization_Number': {1: 'Zero', 2: 'One_to_three', 3: 'Three_to_five', 4: 'Above_five'},
    'Time_to_Treatment': {1: 'Up_to_one_month', 2: 'One_to_three_months', 3: 'More_than_three_months'},
    'Medical_Treatment_Need': {1: 'Not_at_all', 2: 'Moderately', 3: 'Large_extent', 4: 'Very_large_extent'},
    'Quality_of_Life': {1: 'Very_good', 2: 'Good', 3: 'Bad', 4: 'Very_bad'},
    'Symptom_Frequency': {1: 'Did_not_occur', 2: 'Rather_rarely', 3: 'Most_of_time', 4: 'All_the_time'},
    'Symptom_Household_Impact': {1: 'Do_not_affect', 2: 'Rarely_affect', 3: 'Often_affect', 4: 'Always_affect'},
    'Sleep_Issues': {1: 'Not_once', 2: 'Rather_rarely', 3: 'Most_of_time', 4: 'All_the_time'},
    'Dependency_Fear': {1: 'Not_afraid', 2: 'Minimally', 3: 'Large_extent', 4: 'Very_large_extent'},
    'Energy_Level': {1: 'Fully', 2: 'Mostly', 3: 'Very_little', 4: 'Not_at_all'},
    'Self_Care': {1: 'No_problems', 2: 'Minor_problems', 3: 'Serious_problems', 4: 'Cannot_perform'},
    'Daily_Activities_Difficulty': {1: 'No_problems', 2: 'Minor_problems', 3: 'Moderate_problems', 4: 'Serious_problems', 5: 'Unable_to_perform'},
    'Work_Readiness': {1: 'Satisfied', 2: 'Moderately_satisfied', 3: 'Dissatisfied'},
    'Support_Satisfaction': {1: 'Satisfied', 2: 'Moderately_satisfied', 3: 'Dissatisfied'},
    'Coping_Strategy': {1: 'Well', 2: 'Hard_to_say', 3: 'Badly'},
    'Negative_Emotions': {1: 'Never', 2: 'Rarely', 3: 'Often', 4: 'All_the_time'}
}

# Map numerical codes to meaningful labels
for var in single_choice_vars:
    if var in options_dict:
        df[var] = df[var].map(options_dict[var])

# One-hot encode the single-choice categorical variables
df = pd.get_dummies(df, columns=single_choice_vars)

# Save the processed data to a CSV file
df.to_csv('processed_survey_data_columns.csv', index=False)

# Display the first few rows of the processed DataFrame
print(df.head())


   ID  Smoked_Cigarettes  Year_Diagnosed  Surgical_Treatment  Chemotherapy  \
0   1                  1            2021                   0             1   
1   2                  1            2020                   1             1   
2   3                  1            2020                   1             0   
3   4                  1            2021                   1             1   
4   5                  1            2019                   1             1   

   Radiotherapy  Immunotherapy  Molecular_targeted_Therapy  Emotional_Impact  \
0             0              1                           0                 1   
1             0              0                           0                 1   
2             0              0                           0                 1   
3             0              0                           0                 1   
4             0              0                           0                 1   

   Travel_Impact  ...  Support_Satisfaction_Dissat

# Data cleaning

In [96]:
df.shape

(300, 103)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Columns: 103 entries, ID to Negative_Emotions_Rarely
dtypes: bool(78), int64(25)
memory usage: 81.6 KB


In [98]:
# 1. Remove duplicate rows
df = df.drop_duplicates()
print(f"\nData after removing duplicates: {df.shape}")

# 2. Handle missing values
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Fill missing categorical values with mode
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Unknown'))
print(f"\nData after handling missing values:\n{df.isnull().sum()}")

# 3. Drop irrelevant or redundant columns  
if 'ID' in df.columns:
    df = df.drop(columns=['ID'])

# 4. Standardize column names  
df.columns = [col.strip().replace(' ', '_').lower() for col in df.columns]


Data after removing duplicates: (300, 103)

Data after handling missing values:
ID                                0
Smoked_Cigarettes                 0
Year_Diagnosed                    0
Surgical_Treatment                0
Chemotherapy                      0
                                 ..
Coping_Strategy_Well              0
Negative_Emotions_All_the_time    0
Negative_Emotions_Never           0
Negative_Emotions_Often           0
Negative_Emotions_Rarely          0
Length: 103, dtype: int64


**All columns are integer value, numerical columns. But we have categorical columns in them like Gender, Marital Status, Smoked cigaretes, Chemoheraphy etc. ( some are binary, some are multiclass)**

In [99]:
import pandas as pd

 
df = pd.read_csv('processed_survey_data_columns.csv')

 
columns_to_exclude = ['ID', 'Year_Diagnosed']
df_for_itemsets = df.drop(columns=columns_to_exclude)

# Convert all non-zero values to 1 (binary)
df_for_itemsets = df_for_itemsets.applymap(lambda x: 1 if x != 0 else 0)

 
transactions = []
 
for index, row in df_for_itemsets.iterrows():
    # Extract the column names where the value is 1
    itemset = row[row == 1].index.tolist()
    transactions.append(itemset)

# Display the first 5 transactions
for i, transaction in enumerate(transactions[:5]):
    print(f"Transaction {i+1}: {transaction}")


Transaction 1: ['Smoked_Cigarettes', 'Chemotherapy', 'Immunotherapy', 'Emotional_Impact', 'Travel_Impact', 'Symptoms_exp_cough', 'Symptoms_exp_Shortness_of_breath', 'Support_From_Close', 'Daily_Life_Impact_Psychological', 'Gender_Female', 'Age_Group_Below_30', 'Residence_Town_upto_100k', 'Education_Level_Higher', 'Source_of_Income_Employment', 'Marital_Status_Single', 'Hospitalization_Number_Three_to_five', 'Time_to_Treatment_One_to_three_months', 'Medical_Treatment_Need_Large_extent', 'Quality_of_Life_Good', 'Symptom_Frequency_Most_of_time', 'Symptom_Household_Impact_Often_affect', 'Sleep_Issues_Most_of_time', 'Dependency_Fear_Large_extent', 'Energy_Level_Mostly', 'Self_Care_No_problems', 'Daily_Activities_Difficulty_No_problems', 'Work_Readiness_Moderately_satisfied', 'Support_Satisfaction_Moderately_satisfied', 'Coping_Strategy_Badly', 'Negative_Emotions_Often']
Transaction 2: ['Smoked_Cigarettes', 'Surgical_Treatment', 'Chemotherapy', 'Emotional_Impact', 'Travel_Impact', 'Symptoms_

  df_for_itemsets = df_for_itemsets.applymap(lambda x: 1 if x != 0 else 0)


In [100]:
import mlxtend
print("mlxtend version:", mlxtend.__version__)


mlxtend version: 0.23.2


In [101]:
df = pd.read_csv('processed_survey_data_columns.csv')

# List of columns to exclude (not relevant for itemsets)
columns_to_exclude = ['ID', 'Year_Diagnosed']

# Drop the columns to exclude
df_for_itemsets = df.drop(columns=columns_to_exclude)

# Ensure all values are binary (1 or 0)
df_for_itemsets = df_for_itemsets.apply(lambda x: x.apply(lambda y: 1 if y != 0 else 0))
 
transactions = []

for index, row in df_for_itemsets.iterrows():
    # Get the features where the value is 1
    itemset = row[row == 1].index.tolist()
    transactions.append(itemset)

# FP

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_fpgrowth = pd.DataFrame(te_ary, columns=te.columns_)

 
frequent_itemsets = fpgrowth(df_fpgrowth, min_support=0.2, use_colnames=True)

 
num_itemsets = len(frequent_itemsets)

 
# rules = association_rules(frequent_itemsets, num_itemsets=num_itemsets, metric="confidence", min_threshold=0.2)

 
print("Frequent Itemsets:")
print(frequent_itemsets.sort_values(by='support', ascending=False))

# # Display the association rules
# print("\nAssociation Rules:")
# print(rules.sort_values(by='lift', ascending=False))

Frequent Itemsets:
       support                                           itemsets
0     0.903333                               (Support_From_Close)
1     0.863333                                 (Emotional_Impact)
65    0.810000             (Support_From_Close, Emotional_Impact)
33    0.776667                   (Support_Satisfaction_Satisfied)
6811  0.776667  (Support_From_Close, Support_Satisfaction_Sati...
...        ...                                                ...
2990  0.200000  (Support_Satisfaction_Satisfied, Symptom_Frequ...
2992  0.200000  (Support_Satisfaction_Satisfied, Health_Satisf...
2993  0.200000  (Support_Satisfaction_Satisfied, Symptom_Frequ...
2994  0.200000  (Support_Satisfaction_Satisfied, Health_Satisf...
9719  0.200000  (Symptom_Frequency_Did_not_occur, Symptoms_exp...

[9720 rows x 2 columns]


## Lowest ST, highest number of items 

In [102]:
print("Frequent Itemsets:") 

for i in frequent_itemsets[-1:]["itemsets"]:
    print(i)

Frequent Itemsets:
frozenset({'Symptom_Frequency_Did_not_occur', 'Symptoms_exp_None', 'Symptom_Household_Impact_Do_not_affect', 'Coping_Strategy_Well', 'Self_Care_No_problems'})


# Relim

In [103]:
#!pip install pymining
from pymining import itemmining

In [104]:
trans_sets = [set(transaction) for transaction in transactions]
 
relim_input = itemmining.get_relim_input(trans_sets)

 
total_transactions = len(transactions)

min_support_ratio = 0.2   
min_support = max(1, int(min_support_ratio * total_transactions))   

# Relim
itemsets = itemmining.relim(relim_input, min_support=min_support)
 
print("Frequent Itemsets:")
for itemset, support in itemsets.items():
    itemset_str = ', '.join(itemset)
    print(f"Itemset: {{{itemset_str}}}, Support: {support}")

Frequent Itemsets:
Itemset: {Self_Care_Minor_problems}, Support: 60
Itemset: {Emotional_Impact, Self_Care_Minor_problems}, Support: 60
Itemset: {Symptom_Frequency_Did_not_occur}, Support: 61
Itemset: {Symptom_Frequency_Did_not_occur, Symptoms_exp_None}, Support: 60
Itemset: {Symptom_Frequency_Did_not_occur, Symptoms_exp_None, Symptom_Household_Impact_Do_not_affect}, Support: 60
Itemset: {Coping_Strategy_Well, Symptom_Frequency_Did_not_occur, Symptoms_exp_None, Symptom_Household_Impact_Do_not_affect}, Support: 60
Itemset: {Coping_Strategy_Well, Symptom_Frequency_Did_not_occur, Symptoms_exp_None, Symptom_Household_Impact_Do_not_affect, Self_Care_No_problems}, Support: 60
Itemset: {Symptom_Frequency_Did_not_occur, Symptoms_exp_None, Symptom_Household_Impact_Do_not_affect, Self_Care_No_problems}, Support: 60
Itemset: {Coping_Strategy_Well, Symptom_Frequency_Did_not_occur, Symptoms_exp_None}, Support: 60
Itemset: {Coping_Strategy_Well, Symptom_Frequency_Did_not_occur, Symptoms_exp_None, Sel