In [None]:
import csv
import numpy as np
from faker import Faker

fake = Faker()

# Seed both random sources so that "Restart Kernel -> Run All" regenerates the
# exact same synthetic dataset: Faker drives names and categorical picks,
# NumPy drives the ages.
SEED = 42
Faker.seed(SEED)
np.random.seed(SEED)

# Number of synthetic patient records to generate
NUM_PATIENTS = 400000

# Value pools for the categorical columns. Defined once here instead of being
# rebuilt on every loop iteration.
GENDERS = ('Male', 'Female')
RACES = ('Caucasian', 'African American', 'Asian', 'Hispanic', 'Other')
DRUGS = ('25dph-7.5peh','warfarin (bulk) 100 % powder','chantix','12 hour nasal relief spray ,non-aerosol','wymzya fe','pyrogallol crystals','lyza','lysiplex plus liquid','lysteda','pyrithione zinc shampoo')
SIDE_EFFECTS = ('Headache', 'Nausea', 'Fatigue', 'Fever','Drowsiness','Dizziness','dry mouth /nose/throat','upset stomach','constipation','Bloating')

# Create an empty list to store patient data
patients = []

# Generate one row per patient: [name, age, gender, race, drug, side_effects]
for _ in range(NUM_PATIENTS):
    name = fake.name()
    # randint's upper bound is exclusive, so ages fall in 18..89
    age = np.random.randint(18, 90)
    gender = fake.random_element(elements=GENDERS)
    race = fake.random_element(elements=RACES)
    drug = fake.random_element(elements=DRUGS)
    # Several distinct side effects (unique=True prevents repeats), stored as
    # a single comma-separated string in one CSV field.
    side_effects = ', '.join(fake.random_elements(elements=SIDE_EFFECTS, unique=True))

    # Append patient data to the list
    patients.append([name, age, gender, race, drug, side_effects])

# Define the CSV file name
csv_file_name = 'Drug_Effects.csv'

# Write the data to the CSV file. newline='' is what the csv module expects for
# files it writes to; the explicit encoding keeps the file platform-independent.
with open(csv_file_name, mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header row
    csv_writer.writerow(['Name', 'Age', 'Gender', 'Race','Drug', 'Side Effects'])

    # Write the patient data
    csv_writer.writerows(patients)

print(f"CSV file '{csv_file_name}' has been created.")


In [None]:
import pandas as pd

In [None]:
# Path of the CSV file to load
csv_file_name = 'Drug_Effects.csv'

# Read the whole file into a DataFrame; column dtypes are inferred by pandas
data = pd.read_csv(csv_file_name)

In [None]:
# Preview the first few rows to sanity-check the loaded columns and values
data.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

In [None]:
# Print a concise per-column summary of the frame (pandas DataFrame.info)
data.info()

In [None]:
# Summary statistics for the frame as reported by pandas' describe()
data.describe()

In [None]:
# Inspect the dtype pandas assigned to each column
data.dtypes

In [None]:
# Count missing (null/NaN) values in each column
data.isnull().sum()

In [None]:
# Remove rows that are exact duplicates of another row, then rebind `data`
# to the de-duplicated frame.
data = data.drop_duplicates()

# Display the resulting frame
data

In [None]:
# Dimensions after de-duplication; compare with the shape shown earlier to
# see how many rows were removed
data.shape

In [None]:
# Number of distinct values in each column
data.nunique()

In [None]:
# Youngest patients first: order rows by ascending Age and show the top rows
(
    data
    .sort_values(by="Age", ascending=True)
    .head()
)

In [None]:
# Order rows by ascending Name and show the top rows
(
    data
    .sort_values(by="Name", ascending=True)
    .head()
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of patient ages (20 bins) with a KDE curve overlaid.
# histplot returns the Axes it drew on, so labels and title are set on it directly.
sns.histplot(data['Age'], bins=20, kde=True).set(
    xlabel='Age',
    ylabel='Frequency',
    title='Age Distribution',
)
plt.show()

In [None]:
# Age distribution: 20-bin histogram with a KDE overlay.
# NOTE(review): this cell draws the same Age histogram as the cell directly
# above it; one of the two can probably be removed.
sns.histplot(data['Age'], bins=20, kde=True)
plt.gca().set(
    title='Age Distribution',
    xlabel='Age',
    ylabel='Frequency',
)
plt.show()


In [None]:
# Bar chart of how many rows fall into each Gender category.
# countplot returns the Axes it drew on, so labels and title are set on it directly.
sns.countplot(data=data, x='Gender').set(
    title='Gender Distribution',
    xlabel='Gender',
    ylabel='Count',
)
plt.show()

In [None]:
##removing outlier using z-score
from scipy import stats

# Standard score of every Age value relative to the column's mean and std
z_scores = stats.zscore(data['Age'])

# Keep rows whose Age lies within 3 standard deviations of the mean on BOTH
# sides. Comparing the raw score (`z_scores < 3`) would only trim the upper
# tail and keep arbitrarily large negative z-scores, so the absolute value is
# used. `.copy()` detaches the result from the original frame so later column
# assignments on `data` do not raise SettingWithCopyWarning.
data = data[abs(z_scores) < 3].copy()

In [None]:
# Display the frame as it stands after the outlier filter above
data

In [None]:
# Dimensions after outlier removal; compare with the earlier shape to see how
# many rows were filtered out
data.shape

In [None]:
# Correlation can only be computed between numeric columns. The frame still
# holds string columns at this point (e.g. Name, Drug, Side Effects), and
# calling `data.corr()` on them raises under pandas >= 2.0, so restrict the
# computation to numeric columns explicitly.
correlation_matrix = data.select_dtypes(include='number').corr()

# Annotated heatmap of the pairwise correlations
plt.figure(figsize=(10, 6))
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Work on an explicit copy: `data` may be a filtered slice of an earlier frame,
# and assigning columns on such a slice triggers SettingWithCopyWarning.
data = data.copy()

# 1. Encode the two-valued Gender column as integers
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])

# 2. One-hot encode Race into Race_<value> indicator columns
data = pd.get_dummies(data, columns=['Race'], prefix='Race')

# Split the dataset into features (X) and target (y)
# NOTE(review): X still contains the raw string columns Name and Drug; they
# will need to be dropped or encoded before fitting most estimators.
X = data.drop(columns=['Side Effects'])
y = data['Side Effects']

# Split the dataset into training and testing sets (adjust the test_size as needed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train_test_split may hand back slices of X; copy so the scaled values can be
# assigned without warnings and without touching X.
X_train = X_train.copy()
X_test = X_test.copy()

# 3. Feature Scaling
# Fit the scaler on the training rows ONLY and reuse those statistics for the
# test rows. Fitting on the full dataset before splitting would leak the test
# set's mean/std into training. As a consequence `data` and `X` stay unscaled;
# only X_train / X_test hold standardized values.
scaler = StandardScaler()
numeric_columns = X_train.select_dtypes(include=['number']).columns
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
