In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences
from datetime import datetime

test_seq = pd.read_csv('test_sequences.csv')
train_labels = pd.read_csv('train_labels.csv')
train_seq = pd.read_csv('train_sequences.csv')
val_labels = pd.read_csv('validation_labels.csv')
val_seq = pd.read_csv('validation_sequences.csv')

base_encoder = LabelEncoder()
base_encoder.fit(['A', 'U', 'G', 'C', 'N'])

def clean_and_encode_sequence(seq):
    clean_seq = seq.replace("-", "").replace("X", "N") 
    return np.array(base_encoder.transform(list(clean_seq)))  

train_seq["encoded_seq"] = train_seq["sequence"].apply(clean_and_encode_sequence)
train_labels["target_base"] = train_labels["ID"].apply(lambda x: "_".join(x.split("_")[:2]))

merged_data = train_seq.merge(
    train_labels, left_on="target_id", right_on="target_base"
)

merged_data.drop(columns=["target_base"], inplace=True)

merged_data.head()

X = merged_data[['encoded_seq','temporal_cutoff','description']]
y = train_labels[['x_1', 'y_1', 'z_1']]

print(y.shape)
print(X.shape)

X.fillna(0, inplace=True)
y.fillna(0, inplace=True)

def convert_date_to_numeric(date_str):
    """Convert date string to numerical timestamp."""
    return datetime.strptime(date_str, "%Y-%m-%d").timestamp()

# Convert `temporal_cutoff` to numerical values
X['temporal_cutoff'] = X['temporal_cutoff'].apply(convert_date_to_numeric)
# Convert sequences into a fixed-length numerical representation
max_seq_length = max(X['encoded_seq'].apply(len))  # Find longest sequence
X = pad_sequences(X['encoded_seq'], maxlen=max_seq_length, padding='post', dtype='int32')


X_train, X_test, y_train, y_test  = train_test_split(X,y, random_state=0)


model = LinearRegression()
model.fit(X_train, y_train)

prediction  = model.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model.score(X_test, y_test)))

(137095, 3)
(137095, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['temporal_cutoff'] = X['temporal_cutoff'].apply(convert_date_to_numeric)


Accuracy of logistic regression classifier on test set: 0.84


In [34]:
test_seq["encoded_seq"] = test_seq["sequence"].apply(clean_and_encode_sequence)
val_labels["target_base"] = val_labels["ID"].apply(lambda x: "_".join(x.split("_")[:2]))
val_labels = val_labels.iloc[:12]

merged_data = test_seq.merge(
    val_labels, left_on="target_id", right_on="target_base"
)

#merged_data.drop(columns=["target_base"], inplace=True)

merged_data.head()

X = test_seq[['encoded_seq','temporal_cutoff','description']]
y = val_labels[['x_1', 'y_1', 'z_1', 'x_2', 'y_2', 'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5', 'z_5']]

print(y.shape)
print(X.shape)

X.fillna(0, inplace=True)
y.fillna(0, inplace=True)

def convert_date_to_numeric(date_str):
    """Convert date string to numerical timestamp."""
    return datetime.strptime(date_str, "%Y-%m-%d").timestamp()

# Convert `temporal_cutoff` to numerical values
X['temporal_cutoff'] = X['temporal_cutoff'].apply(convert_date_to_numeric)
# Convert sequences into a fixed-length numerical representation
max_seq_length = max(X['encoded_seq'].apply(len))  # Find longest sequence
X = pad_sequences(X['encoded_seq'], maxlen=max_seq_length, padding='post', dtype='int32')


X_train, X_test, y_train, y_test  = train_test_split(X,y, random_state=0)


model = LinearRegression()
model.fit(X_train, y_train)

prediction  = model.predict(X_test)

print(prediction)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model.score(X_test, y_test))) 

# Create a submission file
submission = pd.DataFrame(prediction, columns=['x_1', 'y_1', 'z_1', 'x_2', 'y_2', 'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5', 'z_5'])
submission.insert(0, 'ID', test_seq['target_id'])  # Add 'ID' column
submission.insert(1, 'resname', val_labels['resname'])  # Add 'resname' column
submission.insert(2, 'resid', val_labels['resid'])  # Add 'resid' column
submission.to_csv('submission.csv', index=False)  # Save with 'ID' as the index label




(12, 15)
(12, 3)
[[-9.90934037e+00  2.66688948e+01  1.09686362e+01 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18]
 [-2.57520008e+01  2.88540001e+01  8.54800034e+00 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18]
 [-5.94777826e+00  2.61793976e+01  2.15013002e+01 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18 -1.00000000e+18
  -1.00000000e+18 -1.00000000e+18 -1.00000000e+18]]
Accuracy of logistic regression classifier on test set: 0.88


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['temporal_cutoff'] = X['temporal_cutoff'].apply(convert_date_to_numeric)
