In [76]:
# Imports
import os

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [85]:
# Define the correct datatypes for pandas
dtype_dict = {
    'HCP_ID': 'str',
    'PATIENT_AGE': 'Int64',
    'PATIENT_GENDER': 'category',
    'PATIENT_ID': 'str',
    'STATE': 'category',
    'HCP_SPECIALTY': 'category',
    'HCP_AGE': 'Int64',
    'HCP_GENDER': 'category',
    'STATE_NAME': 'category',
    'TXN_LOCATION_TYPE': 'category',
    'INSURANCE_TYPE': 'category',
    'TXN_TYPE': 'category',
    'TXN_DESC': 'category',
    'LOWCONT_COUNT': 'Int64',
    'MEDCONT_COUNT': 'Int64',
    'HIGHCONT_COUNT': 'Int64',
    'NUM_CONDITIONS': 'Int64',
    'NUM_CONTRAINDICATIONS': 'Int64',
    'NUM_SYMPTOMS': 'Int64',
    'DRUGA_COUNT': 'Int64',
    'TARGET': 'Int64'
}

# Define the path to the CSV file
file_path = '../../data/model_input/ads_model_table.csv'

# Check if the file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist.")

# Load the CSV file without specifying dtype initially
ads_model_df = pd.read_csv(file_path)

# Convert columns to the specified data types
for col, dtype in dtype_dict.items():
    if col in ads_model_df.columns:
        if dtype == 'category':
            ads_model_df[col] = ads_model_df[col].astype('category')
        elif dtype == 'Int64':
            ads_model_df[col] = pd.to_numeric(ads_model_df[col], errors='coerce').astype('Int64')
        else:
            ads_model_df[col] = ads_model_df[col].astype(dtype)

# Parse dates to datetime
date_columns = ['TXN_DT']
for col in date_columns:
    if col in ads_model_df.columns:
        ads_model_df[col] = pd.to_datetime(ads_model_df[col], errors='coerce')

# Display the DataFrame info
ads_model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3407 entries, 0 to 3406
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PATIENT_ID             3407 non-null   object        
 1   PATIENT_GENDER         3407 non-null   category      
 2   NUM_CONDITIONS         3407 non-null   Int64         
 3   TXN_DT                 3407 non-null   datetime64[ns]
 4   HCP_ID                 3407 non-null   object        
 5   TXN_LOCATION_TYPE      3407 non-null   category      
 6   INSURANCE_TYPE         3407 non-null   category      
 7   TXN_TYPE               3407 non-null   category      
 8   TXN_DESC               3407 non-null   category      
 9   RANK                   3407 non-null   int64         
 10  PATIENT_AGE_DIAGNOSED  3407 non-null   int64         
 11  STATE                  3407 non-null   category      
 12  HCP_SPECIALTY          3407 non-null   category      
 13  HCP

In [None]:
# Handle missing values (drop rows with no HCP_ID / PATIENT_ID, i.e. HCPs not available in the target list)
#ads_model_df = ads_model_df.dropna(subset=['HCP_ID', 'PATIENT_ID'])
#ads_model_df.info()

# Handle missing values using SimpleImputer
#imputer = SimpleImputer(strategy='most_frequent')
#ads_model_df_imputed = pd.DataFrame(imputer.fit_transform(ads_model_df), columns=ads_model_df.columns)
#ads_model_df_imputed.info()

In [None]:
# Convert object columns back to category
#for col, dtype in dtype_dict.items():
#    if dtype == 'category' and col in ads_model_df_imputed.columns:
#        ads_model_df_imputed[col] = ads_model_df_imputed[col].astype('category')

In [86]:
# Preprocess categorical features
categorical_columns = ads_model_df.select_dtypes(include=['category']).columns
ads_model_df = pd.get_dummies(ads_model_df, columns=categorical_columns, drop_first=True)

In [87]:
# Ensure the dummy variables are integers (1 and 0)
for col in ads_model_df.columns:
    if ads_model_df[col].dtype == bool:
        ads_model_df[col] = ads_model_df[col].astype(int)

In [90]:
ads_model_df.info()
ads_model_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3407 entries, 0 to 3406
Columns: 167 entries, PATIENT_ID to HCP_GENDER_U-Unknown
dtypes: Int64(3), datetime64[ns](1), int64(161), object(2)
memory usage: 4.4+ MB


Unnamed: 0,PATIENT_ID,NUM_CONDITIONS,TXN_DT,HCP_ID,RANK,PATIENT_AGE_DIAGNOSED,DRUGA_COUNT,TARGET,PATIENT_GENDER_M-Male,TXN_LOCATION_TYPE_CLINIC - FREESTANDING,...,HCP_SPECIALTY_SPORTS MEDICINE (EMERGENCY MEDICINE),HCP_SPECIALTY_SPORTS MEDICINE (FAMILY MEDICINE),HCP_SPECIALTY_SPORTS MEDICINE (PEDIATRICS),"HCP_SPECIALTY_STUDENT, HEALTH CARE",HCP_SPECIALTY_THORACIC SURGERY,HCP_SPECIALTY_UNSPECIFIED,HCP_SPECIALTY_UROLOGY,HCP_SPECIALTY_VASCULAR & INTERVENTIONAL RADIOLOGY,HCP_GENDER_M-Male,HCP_GENDER_U-Unknown
0,1,1,2022-06-11,24633,1,34,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,2,1,2022-06-22,7777,1,2,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,3,1,2022-06-20,17051,1,49,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,2022-06-30,19478,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,7,1,2022-06-06,8189,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
# Define features and target variable
X = ads_model_df.drop(columns=['PATIENT_ID','HCP_ID', 'TXN_DT', 'TARGET'])
y = ads_model_df['TARGET']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       561
         1.0       1.00      1.00      1.00       121

    accuracy                           1.00       682
   macro avg       1.00      1.00      1.00       682
weighted avg       1.00      1.00      1.00       682



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
