In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
data = pd.read_csv('Doceree-HCP_Train.csv', encoding='latin1')

FileNotFoundError: [Errno 2] No such file or directory: 'Doceree-HCP_Train.csv'

In [None]:
data.head()
data

In [65]:
# Check for NaN values in the entire DataFrame
nan_counts = data.isnull().sum()

# Print the column-wise counts of NaN values
print(nan_counts)

ID                     0
DEVICETYPE             0
PLATFORM_ID            0
BIDREQUESTIP           0
USERPLATFORMUID        4
USERCITY            6359
USERZIPCODE         4592
USERAGENT              2
PLATFORMTYPE           0
CHANNELTYPE            0
URL                    0
KEYWORDS               0
TAXONOMY           81624
IS_HCP                 1
dtype: int64


In [66]:
# Fill NaN values with a specific value
data_filled = data.fillna(0)  # Replace NaN with 0

# Check for NaN values in the entire DataFrame
nan_counts = data_filled.isnull().sum()

# Print the column-wise counts of NaN values
print(nan_counts)

# Print the number of rows
num_rows = data.shape[0]
print("Number of rows:", num_rows)

data_filled.head()

ID                 0
DEVICETYPE         0
PLATFORM_ID        0
BIDREQUESTIP       0
USERPLATFORMUID    0
USERCITY           0
USERZIPCODE        0
USERAGENT          0
PLATFORMTYPE       0
CHANNELTYPE        0
URL                0
KEYWORDS           0
TAXONOMY           0
IS_HCP             0
dtype: int64
Number of rows: 113937


Unnamed: 0,ID,DEVICETYPE,PLATFORM_ID,BIDREQUESTIP,USERPLATFORMUID,USERCITY,USERZIPCODE,USERAGENT,PLATFORMTYPE,CHANNELTYPE,URL,KEYWORDS,TAXONOMY,IS_HCP
0,1001,Desktop,2,170.173.0.22,6974dcaa-f932-480e-9fb5-c52e20e1393a,Portland,97206.0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Online Medical Journal,Website,https://www.cancertherapyadvisor.com/home/canc...,General|Endocrine|False|Medicine|Surgery|Urolo...,0,0.0
1,1002,Desktop,2,65.216.253.25,c12f3f8f-8fcf-484a-90e1-1ac04db8cdcf,Arlington,22202.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.cancertherapyadvisor.com/home/deci...,Bone Marrow|Radiography|Chronic|Oncology|Psych...,0,0.0
2,1003,Desktop,2,66.232.79.22,a698de4b-e200-46dd-b5fb-40402175ae18,New Meadows,83654.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.cancertherapyadvisor.com/home/canc...,General|Endocrine|False|Medicine|Surgery|Urolo...,0,0.0
3,1004,Desktop,3,137.54.125.246,45967533-75c8-4fbd-a00c-e6ff20447aaa,0,229114624.0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Online Medical Journal,Website,https://globalrph.com/medcalcs/warfarin-mainte...,Dental|Total|Clinical|Pharmacology|Physicians|...,2084P0800X,1.0
4,1005,Mobile,7,174.202.231.99,a17e25be-532d-4cf5-b916-9308c8c3961f,Houston,77008.0,Mozilla/5.0 (iPhone; CPU iPhone OS 16_1_1 like...,Online Medical Journal,Website,https://www.cureus.com/articles/58184-a-review...,Critical Care|Emergency Medicine|General Pract...,0,0.0


In [67]:
# Separate the target variable
X = data_filled.drop(['IS_HCP'], axis=1)
print(data_filled['IS_HCP'])

0         0.0
1         0.0
2         0.0
3         1.0
4         0.0
         ... 
113932    1.0
113933    1.0
113934    1.0
113935    1.0
113936    1.0
Name: IS_HCP, Length: 113937, dtype: float64


In [68]:
# Preprocess numeric columns
numeric_columns = ['PLATFORM_ID', 'USERZIPCODE']
X[numeric_columns] = X[numeric_columns].astype(float)  # Convert to float if needed

In [69]:
# Preprocess string columns
string_columns = ['DEVICETYPE', 'BIDREQUESTIP', 'USERPLATFORMUID', 'USERCITY', 'USERAGENT', 'PLATFORMTYPE', 'CHANNELTYPE', 'URL', 'KEYWORDS', 'TAXONOMY']
for column in string_columns:
    if column in X.columns:
        X[column] = LabelEncoder().fit_transform(X[column].astype(str))

In [70]:
# Encode categorical columns using sparse one-hot encoding
categorical_columns = ['DEVICETYPE', 'USERAGENT', 'PLATFORMTYPE', 'CHANNELTYPE']
onehot_encoder = OneHotEncoder(sparse=True, drop='first')
X_encoded = onehot_encoder.fit_transform(X[categorical_columns])



In [71]:
# Create feature names for one-hot encoded columns
feature_names = []
for i, column in enumerate(categorical_columns):
    categories = onehot_encoder.categories_[i][1:]  # Exclude the first category
    feature_names.extend([f"{column}_{category}" for category in categories])


In [72]:
# Create a DataFrame with the encoded features
X_encoded_df = pd.DataFrame.sparse.from_spmatrix(X_encoded, columns=feature_names)

In [73]:
# Drop the original categorical columns from X
X.drop(categorical_columns, axis=1, inplace=True)

In [74]:
# Concatenate the original and encoded features
X = pd.concat([X, X_encoded_df], axis=1)

In [75]:
# Split the data into training and testing sets
print(data_filled['IS_HCP'])
y=data_filled['IS_HCP'].astype(bool)
print(data_filled['IS_HCP'].unique())
# Encode target variable as boolean
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

0         0.0
1         0.0
2         0.0
3         1.0
4         0.0
         ... 
113932    1.0
113933    1.0
113934    1.0
113935    1.0
113936    1.0
Name: IS_HCP, Length: 113937, dtype: float64
[0. 1.]


In [78]:

# Define the subsample size
subsample_size = 1000

# Create a subsample of the training data
indices = np.random.choice(range(X_train.shape[0]), subsample_size, replace=False)
X_train_subsample = X_train.iloc[indices]
y_train_subsample = y_train[indices]

# Instantiate and fit the logistic regression model for feature selection
estimator = LogisticRegression(penalty='l1', solver='saga', max_iter=1000)
estimator.fit(X_train_subsample, y_train_subsample)

# Instantiate the feature selection method
selector = SelectKBest(score_func=f_classif, k=10)

# Fit the selector on the training data
selector.fit(X_train_subsample, y_train_subsample)

# Transform the training and testing data using the selector
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Train the Random Forest classifier
model = RandomForestClassifier()
model.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = model.predict(X_test_selected)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

  f = msb / msw


Accuracy: 0.9995611725469545
