In [None]:
# performing SVM on preprocessed dataset AID1239
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with dtype handling
file_path = '/content/preprocessed_AID1239_12531Columns_CNN__Active_Inactive.csv'

# Read the dataset while handling potential dtype issues
df = pd.read_csv(file_path, low_memory=False)

# Display first few rows to understand the structure
display(df.head())

# Checking for missing values
display(df.isnull().sum())

# Fill missing values instead of dropping (to retain data)
df.fillna(df.median(numeric_only=True), inplace=True)

# Ensure the target column has only expected values
if 'PUBCHEM_ACTIVITY_OUTCOME' in df.columns:
    df = df[df['PUBCHEM_ACTIVITY_OUTCOME'].isin(['Active', 'Inactive'])]
    df['PUBCHEM_ACTIVITY_OUTCOME'] = df['PUBCHEM_ACTIVITY_OUTCOME'].map({'Active': 1, 'Inactive': 0})
else:
    raise ValueError("Column 'PUBCHEM_ACTIVITY_OUTCOME' not found in dataset")

# Separate features and target
X = df.drop(columns=['PUBCHEM_ACTIVITY_OUTCOME'])
y = df['PUBCHEM_ACTIVITY_OUTCOME']

# Convert categorical columns (if any) using Label Encoding
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on both training and test sets
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Evaluate the model - Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Get classification report for detailed metrics
report_dict = classification_report(y_test, y_test_pred, output_dict=True)

# Extract relevant metrics
precision = report_dict['1']['precision']
recall = report_dict['1']['recall']
f1_score = report_dict['1']['f1-score']
support = report_dict['1']['support']

# Print results
print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Precision (Class 1): {precision:.4f}')
print(f'Recall (Class 1): {recall:.4f}')
print(f'F1 Score (Class 1): {f1_score:.4f}')
print(f'Support (Class 1): {support}')

print('\nClassification Report:')
print(classification_report(y_test, y_test_pred))


In [None]:
# Performing SVM on preprocessed dataset AID1239
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with dtype handling
file_path = '/content/preprocessed_AID1239_12531Columns_CNN__Active_Inactive.csv'

# Read the dataset while handling potential dtype issues
df = pd.read_csv(file_path, low_memory=False)

# Display first few rows to understand the structure
display(df.head())

# Checking for missing values
display(df.isnull().sum())

# Fill missing values instead of dropping (to retain data)
df.fillna(df.median(numeric_only=True), inplace=True)

# Ensure the target column has only expected values
if 'PUBCHEM_ACTIVITY_OUTCOME' in df.columns:
    df = df[df['PUBCHEM_ACTIVITY_OUTCOME'].isin(['Active', 'Inactive'])]
    df['PUBCHEM_ACTIVITY_OUTCOME'] = df['PUBCHEM_ACTIVITY_OUTCOME'].map({'Active': 1, 'Inactive': 0})
else:
    raise ValueError("Column 'PUBCHEM_ACTIVITY_OUTCOME' not found in dataset")

# Separate features and target
X = df.drop(columns=['PUBCHEM_ACTIVITY_OUTCOME'])
y = df['PUBCHEM_ACTIVITY_OUTCOME']

# Convert categorical columns (if any) using Label Encoding
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# **Fixing Infinite and Large Values Issue**
# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Identify columns with NaN values after replacing infinities
print("Columns with NaN values after replacing infinities:")
print(X.isna().sum()[X.isna().sum() > 0])

# Fill NaN values with median
X.fillna(X.median(numeric_only=True), inplace=True)

# Verify if any large values still exist
print("Max values per column:")
print(X.max(numeric_only=True))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on both training and test sets
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Evaluate the model - Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Get classification report for detailed metrics
report_dict = classification_report(y_test, y_test_pred, output_dict=True)

# Extract relevant metrics
precision = report_dict['1']['precision']
recall = report_dict['1']['recall']
f1_score = report_dict['1']['f1-score']
support = report_dict['1']['support']

# Print results
print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Precision (Class 1): {precision:.4f}')
print(f'Recall (Class 1): {recall:.4f}')
print(f'F1 Score (Class 1): {f1_score:.4f}')
print(f'Support (Class 1): {support}')

print('\nClassification Report:')
print(classification_report(y_test, y_test_pred))


Unnamed: 0.1,Unnamed: 0,PUBCHEM_CID,PUBCHEM_SID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,SMILES.1,MOLECULEID,autocorr2d0,autocorr2d1,autocorr2d2,...,"atom_pairs_((N,1,2),14,(*,1,0))","atom_pairs_((N,3,0),12,(O,1,1))",morgan_counts_943520092,"atom_pairs_((O,2,0),6,(S,4,0))",morgan_counts_640577968,"atom_pairs_((C,2,2),14,(*,1,0))","bpf_((B,5,0),16,(B,5,0))","atom_pairs_((C,4,0),15,(Cl,1,0))",morgan_counts_469020719,morgan_counts_594640005
0,0,1449342,24817956,C1=CC=C(C=C1)C(=O)NC2=NC=C(C=C2)NC(=O)C3=CC=CS3,Active,O=C(NC1=CN=C(NC(=O)C2=CC=CC=C2)C=C1)C1=CC=CS1,M3503957,3.47,3.799,3.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3242114,4247974,C1COCCN1C(=O)C2=NOC(=C2)C3=CC=C(C=C3)Cl,Active,ClC1=CC=C(C=C1)C1=CC(=NO1)C(=O)N1CCOCC1,M2730310,3.353,3.709,3.697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,9594900,17507393,CC1=CC(=NC2=CC=CC=C12)N/N=C/C3=CC(=CC=C3)Br,Active,CC1=CC(N\N=C\C2=CC(Br)=CC=C2)=NC2=CC=CC=C12,M3349978,3.462,3.85,3.88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,823601,7975245,CC1=C(C=CO1)C(=O)NC2=CC3=CC=CC=C3C=C2,Active,CC1=C(C=CO1)C(=O)NC1=CC2=CC=CC=C2C=C1,M2603287,3.196,3.537,3.501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,3239831,4245352,C1=CC=C(C(=C1)NC(=O)C2=CC=C(C=C2)Cl)O,Active,OC1=CC=CC=C1NC(=O)C1=CC=C(Cl)C=C1,M1281287,3.128,3.483,3.514,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0
Unnamed: 0,0
PUBCHEM_CID,0
PUBCHEM_SID,0
SMILES,0
PUBCHEM_ACTIVITY_OUTCOME,0
...,...
"atom_pairs_((C,2,2),14,(*,1,0))",0
"bpf_((B,5,0),16,(B,5,0))",0
"atom_pairs_((C,4,0),15,(Cl,1,0))",0
morgan_counts_469020719,0


Columns with NaN values after replacing infinities:
Ipc    1
dtype: int64
Max values per column:
Unnamed: 0                               3999.0
PUBCHEM_CID                         135541112.0
PUBCHEM_SID                          24841703.0
SMILES                                   3998.0
SMILES.1                                 3998.0
                                       ...     
atom_pairs_((C,2,2),14,(*,1,0))             6.0
bpf_((B,5,0),16,(B,5,0))                    8.0
atom_pairs_((C,4,0),15,(Cl,1,0))            3.0
morgan_counts_469020719                     1.0
morgan_counts_594640005                     1.0
Length: 12530, dtype: float64
Train Accuracy: 0.9441
Test Accuracy: 0.8325
Precision (Class 1): 0.8079
Recall (Class 1): 0.8725
F1 Score (Class 1): 0.8389
Support (Class 1): 400.0

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.83       400
           1       0.81      0.87      0.84       400

    acc

In [None]:
# Performing SVM on preprocessed dataset AID1578
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with dtype handling
file_path = '/content/preprocessed_AID_1578_12773Columns_Active_Inactive.csv'

# Read the dataset while handling potential dtype issues
df = pd.read_csv(file_path, low_memory=False)

# Display first few rows to understand the structure
display(df.head())

# Checking for missing values
display(df.isnull().sum())

# Fill missing values instead of dropping (to retain data)
df.fillna(df.median(numeric_only=True), inplace=True)

# Ensure the target column has only expected values
if 'PUBCHEM_ACTIVITY_OUTCOME' in df.columns:
    df = df[df['PUBCHEM_ACTIVITY_OUTCOME'].isin(['Active', 'Inactive'])]
    df['PUBCHEM_ACTIVITY_OUTCOME'] = df['PUBCHEM_ACTIVITY_OUTCOME'].map({'Active': 1, 'Inactive': 0})
else:
    raise ValueError("Column 'PUBCHEM_ACTIVITY_OUTCOME' not found in dataset")

# Separate features and target
X = df.drop(columns=['PUBCHEM_ACTIVITY_OUTCOME'])
y = df['PUBCHEM_ACTIVITY_OUTCOME']

# Convert categorical columns (if any) using Label Encoding
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# **Fixing Infinite and Large Values Issue**
# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Identify columns with NaN values after replacing infinities
print("Columns with NaN values after replacing infinities:")
print(X.isna().sum()[X.isna().sum() > 0])

# Fill NaN values with median
X.fillna(X.median(numeric_only=True), inplace=True)

# Verify if any large values still exist
print("Max values per column:")
print(X.max(numeric_only=True))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on both training and test sets
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Evaluate the model - Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Get classification report for detailed metrics
report_dict = classification_report(y_test, y_test_pred, output_dict=True)

# Extract relevant metrics
precision = report_dict['1']['precision']
recall = report_dict['1']['recall']
f1_score = report_dict['1']['f1-score']
support = report_dict['1']['support']

# Print results
print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Precision (Class 1): {precision:.4f}')
print(f'Recall (Class 1): {recall:.4f}')
print(f'F1 Score (Class 1): {f1_score:.4f}')
print(f'Support (Class 1): {support}')

print('\nClassification Report:')
print(classification_report(y_test, y_test_pred))


Unnamed: 0,SMILES,MOLECULEID,autocorr2d0,autocorr2d1,autocorr2d2,autocorr2d3,autocorr2d4,autocorr2d5,autocorr2d6,autocorr2d7,...,"torsions_((C,3,1),(C,3,1),(C,3,1),(O,1,1))","torsions_((C,4,0),(C,3,1),(C,3,1),(C,3,1))","torsions_((O,1,1),(C,3,1),(C,4,0),(Cl,1,0))","torsions_((C,3,1),(C,3,1),(C,3,1),(N,3,0))","torsions_((C,3,1),(C,3,1),(C,3,1),(N,2,1))",morgan_counts_2834401467,morgan_counts_1492260254,morgan_counts_4255104684,morgan_counts_12835993,PUBCHEM_ACTIVITY_OUTCOME
0,CC1=C(C2=C(C=C(O)C=C2)N1C1=CC=CC=C1)[N+]([O-])=O,M2624390,3.264,3.641,3.719,3.762,3.625,3.42,3.038,2.059,...,0,1,0.0,0,0,0.0,0.0,0.0,0.0,Active
1,CC(=O)C1=CC=C(OC2=C(C=C(C=C2)C(=O)C2=CC(C)=C(C...,M3480558,3.589,3.95,4.06,3.984,3.896,3.903,3.752,3.665,...,6,6,0.0,0,0,0.0,0.0,0.0,0.0,Active
2,ClCC(=O)N(C(C(=O)NC1CCCCC1)C1=CN=CC=C1)C1=CC(C...,M1253430,3.655,4.003,4.144,4.2,4.388,4.487,4.42,4.029,...,0,1,1.0,2,2,0.0,0.0,0.0,0.0,Active
3,Cl.CN(C)C1=CC=C(NC2=NC3=CC=CC=C3C(N)=N2)C=C1,M3516870,3.301,3.64,3.673,3.528,3.289,3.13,3.071,2.973,...,0,0,0.0,6,5,0.0,0.0,0.0,0.0,Active
4,ClCC(=O)C1=CC2=C3N(CCC2)C(=O)CC3=C1,M1206249,3.183,3.572,3.696,3.536,3.292,3.086,2.887,2.218,...,2,6,1.0,2,0,0.0,0.0,0.0,0.0,Active


Unnamed: 0,0
SMILES,0
MOLECULEID,0
autocorr2d0,0
autocorr2d1,0
autocorr2d2,0
...,...
morgan_counts_2834401467,0
morgan_counts_1492260254,0
morgan_counts_4255104684,0
morgan_counts_12835993,0


Columns with NaN values after replacing infinities:
Ipc    2
dtype: int64
Max values per column:
SMILES                                        3998.000
MOLECULEID                                    3997.000
autocorr2d0                                      4.521
autocorr2d1                                      5.037
autocorr2d2                                      5.298
                                                ...   
torsions_((C,3,1),(C,3,1),(C,3,1),(N,2,1))      12.000
morgan_counts_2834401467                         1.000
morgan_counts_1492260254                         1.000
morgan_counts_4255104684                         2.000
morgan_counts_12835993                           1.000
Length: 12772, dtype: float64
Train Accuracy: 0.9025
Test Accuracy: 0.7188
Precision (Class 1): 0.7297
Recall (Class 1): 0.6950
F1 Score (Class 1): 0.7119
Support (Class 1): 400.0

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.74      

In [None]:
# Performing SVM on preprocessed dataset AID1259354
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with dtype handling
file_path = '/content/preprocessed_AID_1259354_12825Columns_Active_Inactive_activity.csv'

# Read the dataset while handling potential dtype issues
df = pd.read_csv(file_path, low_memory=False)

# Display first few rows to understand the structure
display(df.head())

# Checking for missing values
display(df.isnull().sum())

# Fill missing values instead of dropping (to retain data)
df.fillna(df.median(numeric_only=True), inplace=True)

# Ensure the target column has only expected values
if 'PUBCHEM_ACTIVITY_OUTCOME' in df.columns:
    df = df[df['PUBCHEM_ACTIVITY_OUTCOME'].isin(['Active', 'Inactive'])]
    df['PUBCHEM_ACTIVITY_OUTCOME'] = df['PUBCHEM_ACTIVITY_OUTCOME'].map({'Active': 1, 'Inactive': 0})
else:
    raise ValueError("Column 'PUBCHEM_ACTIVITY_OUTCOME' not found in dataset")

# Separate features and target
X = df.drop(columns=['PUBCHEM_ACTIVITY_OUTCOME'])
y = df['PUBCHEM_ACTIVITY_OUTCOME']

# Convert categorical columns (if any) using Label Encoding
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# **Fixing Infinite and Large Values Issue**
# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Identify columns with NaN values after replacing infinities
print("Columns with NaN values after replacing infinities:")
print(X.isna().sum()[X.isna().sum() > 0])

# Fill NaN values with median
X.fillna(X.median(numeric_only=True), inplace=True)

# Verify if any large values still exist
print("Max values per column:")
print(X.max(numeric_only=True))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on both training and test sets
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Evaluate the model - Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Get classification report for detailed metrics
report_dict = classification_report(y_test, y_test_pred, output_dict=True)

# Extract relevant metrics
precision = report_dict['1']['precision']
recall = report_dict['1']['recall']
f1_score = report_dict['1']['f1-score']
support = report_dict['1']['support']

# Print results
print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Precision (Class 1): {precision:.4f}')
print(f'Recall (Class 1): {recall:.4f}')
print(f'F1 Score (Class 1): {f1_score:.4f}')
print(f'Support (Class 1): {support}')

print('\nClassification Report:')
print(classification_report(y_test, y_test_pred))


Unnamed: 0,SMILES,MOLECULEID,PUBCHEM_ACTIVITY_OUTCOME,autocorr2d0,autocorr2d1,autocorr2d2,autocorr2d3,autocorr2d4,autocorr2d5,autocorr2d6,...,morgan_counts_632999367,"atom_pairs_((N,3,1),2,(O,2,0))","bpf_((B,6,0),20,(B,7,0))","torsions_((C,3,1),(N,3,0),(C,4,0),(N,3,0))","atom_pairs_((C,4,0),13,(S,2,1))","torsions_((N,3,0),(C,3,1),(C,3,1),(Cl,1,0))","atom_pairs_((N,3,1),9,(Cl,1,0))",morgan_counts_207885923,"atom_pairs_((N,2,1),10,(Cl,1,0))",morgan_counts_3208857140
0,OC(=O)C1=CC(NC2=CC(=NS(=O)(=O)C3=CC=C(F)C=C3)C...,M83369620,Active,3.877,4.196,4.296,4.257,4.268,4.238,4.158,...,0,0,0,0,0,0,0,0,0,0
1,CC1=C(C#N)C(=O)NC(=O)\C1=C/C1=CC=C(O1)C1=CC(=C...,M17196202,Active,3.498,3.871,3.939,3.924,3.772,3.641,3.437,...,0,0,0,0,0,0,0,0,0,0
2,CCN1\C(C=CC2=CC=CC=C12)=C1\N=C(OC1=O)C1=CC=CC=C1,M14780255,Active,3.44,3.81,3.868,3.848,3.793,3.605,3.388,...,0,0,0,0,0,0,0,0,0,0
3,C1CC(CCN1)NC1=NC2=C(C=CN2)C(=C1)C1=NC=CC=N1,M13249907,Active,3.386,3.726,3.692,3.629,3.639,3.531,3.375,...,0,0,0,0,0,0,0,0,0,0
4,COC1=CC=C(CCNC2=C(N=C(COC3=CC=CC=C3C)O2)C#N)C=C1,M2688527,Active,3.544,3.844,3.862,3.701,3.589,3.629,3.625,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,0
SMILES,0
MOLECULEID,0
PUBCHEM_ACTIVITY_OUTCOME,0
autocorr2d0,0
autocorr2d1,0
...,...
"torsions_((N,3,0),(C,3,1),(C,3,1),(Cl,1,0))",0
"atom_pairs_((N,3,1),9,(Cl,1,0))",0
morgan_counts_207885923,0
"atom_pairs_((N,2,1),10,(Cl,1,0))",0


Columns with NaN values after replacing infinities:
Series([], dtype: int64)
Max values per column:
SMILES                                         3626
MOLECULEID                                     3626
autocorr2d0                                     916
autocorr2d1                                     971
autocorr2d2                                    1059
                                               ... 
torsions_((N,3,0),(C,3,1),(C,3,1),(Cl,1,0))       2
atom_pairs_((N,3,1),9,(Cl,1,0))                   3
morgan_counts_207885923                           4
atom_pairs_((N,2,1),10,(Cl,1,0))                  3
morgan_counts_3208857140                          3
Length: 12825, dtype: int64
Train Accuracy: 0.9136
Test Accuracy: 0.7822
Precision (Class 1): 0.7994
Recall (Class 1): 0.7534
F1 Score (Class 1): 0.7757
Support (Class 1): 365.0

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       365
           1     