In [12]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at a single, named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
!unzip /content/drive/MyDrive/mini-project-1.zip >> /dev/null

replace mini-project-1/datasets/train/train_feature.npz? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
import joblib  # For loading the model
import pickle  # For loading the tokenizer

from tensorflow.keras.preprocessing.text import Tokenizer


# =========================
# 1. Predictions from SVM Model for Text Sequences
# =========================

# Load the test split of the text-sequence dataset.
# The column is pinned to `str` so pandas never infers a numeric dtype for the
# all-digit strings; a numeric dtype would lose leading zeros and make the
# per-character split below fail.
test_textseq = pd.read_csv(
    '/content/mini-project-1/datasets/test/test_text_seq.csv',
    dtype={'input_str': str},
)

# Split every digit string into a row of integers (one column per character position).
X_test_seq = pd.DataFrame(
    [[int(char) for char in digits] for digits in test_textseq['input_str']]
)

# Load the saved encoder.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted artifacts.
encoder = joblib.load('/content/mini-project-1/saved-models/onehot_encoder.pkl')

# One-hot encode with the already-fitted encoder (transform, not fit_transform).
X_test_encoded = encoder.transform(X_test_seq)

# Load the saved SVM model.
loaded_model_seq = joblib.load('/content/mini-project-1/saved-models/best_svm_model.pkl')

# Predict on the test dataset.
y_pred_seq = loaded_model_seq.predict(X_test_encoded)

# Save predictions to a text file, one prediction per line.
with open("pred_text.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in y_pred_seq)

print("Text sequence predictions saved to 'pred_text.txt'.")


# =========================
# 2. Predictions from Keras Model for Emoticons
# =========================

# Read the emoticon test split and pull the raw emoticon strings out as a list.
test_emoticon_df = pd.read_csv("/content/mini-project-1/datasets/test/test_emoticon.csv")
test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()

# Restore the saved Keras model.
model_emoticon = load_model("/content/mini-project-1/saved-models/dense_model.h5")

# Restore the pickled tokenizer.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted files.
with open("/content/mini-project-1/saved-models/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Tokenize, then post-pad to the sequence length reported by the model's input shape.
max_len = model_emoticon.input_shape[1]
test_sequences = tokenizer.texts_to_sequences(test_emoticon_X)
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Threshold the model outputs at 0.5 to obtain integer 0/1 labels.
emoticon_scores = model_emoticon.predict(test_padded)
predictions_emoticon = (emoticon_scores > 0.5).astype(int)

# Write the first element of each prediction row, one per line
# (zip keeps the output limited to the shorter of inputs/predictions).
with open("pred_emoticon.txt", "w") as f:
    f.writelines(
        f"{pred[0]}\n" for _, pred in zip(test_emoticon_X, predictions_emoticon)
    )

print("Emoticon predictions saved to 'pred_emoticon.txt'.")


# =========================
# 3. Predictions from FeatureModel for Features
# =========================

# Load the trained model and scaler for feature data.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted artifacts.
model_feature = joblib.load("/content/mini-project-1/saved-models/svm_model.joblib")
scaler = joblib.load("/content/mini-project-1/saved-models/scaler.joblib")

# Load the test dataset for features.
# allow_pickle is left at NumPy's safe default (False): the same archive is read
# without it in section 4 of this cell, so the 'features' entry does not need
# pickle support.
test_feat_X = np.load("/content/mini-project-1/datasets/test/test_feature.npz")['features']

# FeatureModel class for prediction
class FeatureModel:
    """Inference wrapper: flatten each sample, optionally scale it, then predict.

    Parameters
    ----------
    model : object exposing ``predict(X)``, optional
        The estimator used to produce predictions. Required by :meth:`predict`.
    scaler : object exposing ``transform(X)``, optional
        Applied to the flattened features before prediction. When ``None``
        the flattened features are handed to the model unscaled.
    """

    def __init__(self, model=None, scaler=None):
        self.model = model
        self.scaler = scaler

    def predict(self, X):
        """Return ``model.predict`` on the flattened (and optionally scaled) input.

        Parameters
        ----------
        X : array-like
            Samples along the first axis; all remaining axes are flattened
            into a single feature axis.

        Raises
        ------
        ValueError
            If no model was supplied to the constructor.
        """
        if self.model is None:
            raise ValueError("FeatureModel.predict requires a model, but model is None")
        # Accept nested lists as well as ndarrays.
        X = np.asarray(X)
        # Collapse every per-sample axis into one feature vector.
        X = X.reshape(X.shape[0], -1)
        if self.scaler is not None:
            X = self.scaler.transform(X)
        return self.model.predict(X)

# Wrap the loaded classifier and scaler in a FeatureModel.
feature_model = FeatureModel(scaler=scaler, model=model_feature)

# Run inference on the test features.
predictions_feature = feature_model.predict(test_feat_X)

# Persist the predictions, one per line.
with open("pred_feat.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predictions_feature)

print("Feature data predictions saved to 'pred_feat.txt'.")


# =========================
# 4. Predictions from Combined Model
# =========================

def _rows_to_text(rows):
    """Stringify each row: space-joined items for a Python list, ``str(row)`` otherwise."""
    return [' '.join(map(str, row)) if isinstance(row, list) else str(row) for row in rows]


def _longest(sequences):
    """Return the length of the longest sequence in ``sequences``."""
    return max(len(seq) for seq in sequences)


# Restore the saved combined model.
model_combined = tf.keras.models.load_model('/content/mini-project-1/saved-models/combined_model.h5')

# --- Test inputs ---
test_emoticon_df = pd.read_csv('/content/mini-project-1/datasets/test/test_emoticon.csv')
test_embedding_df = pd.read_csv('/content/mini-project-1/datasets/test/test_text_seq.csv')
test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()

# The second input is built from the feature archive, stringified row by row.
# NOTE(review): rows of a loaded NumPy array are not Python lists, so it looks like
# the str(row) path is the one taken here -- confirm this matches training.
test_text_X = _rows_to_text(
    np.load('/content/mini-project-1/datasets/test/test_feature.npz')['features']
)

# --- Training inputs (only used to rebuild the tokenizer and the padding lengths) ---
train_emoticon_df = pd.read_csv('/content/mini-project-1/datasets/train/train_emoticon.csv')
train_embedding_df = pd.read_csv('/content/mini-project-1/datasets/train/train_text_seq.csv')
train_emoticon_X = train_emoticon_df['input_emoticon'].tolist()
train_text_X = _rows_to_text(
    np.load('/content/mini-project-1/datasets/train/train_feature.npz')['features']
)

# Character-level tokenizer fitted on the training emoticons plus the stringified features.
# NOTE(review): presumably this mirrors the tokenizer setup used at training time -- verify.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_emoticon_X + train_text_X)

# Emoticon branch: post-pad the test sequences to the longest training sequence.
max_len_emoticon = _longest(tokenizer.texts_to_sequences(train_emoticon_X))
test_emoticon_sequences = tokenizer.texts_to_sequences(test_emoticon_X)
test_emoticon_padded = pad_sequences(test_emoticon_sequences, maxlen=max_len_emoticon, padding='post')

# Text branch: same procedure with its own training-derived maximum length.
max_len_text = _longest(tokenizer.texts_to_sequences(train_text_X))
test_text_sequences = tokenizer.texts_to_sequences(test_text_X)
test_text_padded = pad_sequences(test_text_sequences, maxlen=max_len_text, padding='post')

# Print both input shapes for a manual sanity check.
print(f"Test Emoticon Shape: {test_emoticon_padded.shape}, Test Text Shape: {test_text_padded.shape}")

# Run the two-input model on the padded test data.
predictions_combined = model_combined.predict([test_emoticon_padded, test_text_padded])

# Binarize at 0.5: scores at or above the threshold become 1, everything else 0.
predicted_labels_combined = [
    1 if is_positive else 0 for is_positive in (predictions_combined >= 0.5)
]

# Write the labels, one per line.
with open("pred_combined.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predicted_labels_combined)

print("Combined predictions saved to 'pred_combined.txt'.")




Text sequence predictions saved to 'pred_text.txt'.
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Emoticon predictions saved to 'pred_emoticon.txt'.


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
import joblib  # For loading the model
import pickle  # For loading the tokenizer

from tensorflow.keras.preprocessing.text import Tokenizer


# =========================
# 1. Predictions from SVM Model for Text Sequences
# =========================

# Load the test split of the text-sequence dataset.
# The column is pinned to `str` so pandas never infers a numeric dtype for the
# all-digit strings; a numeric dtype would lose leading zeros and make the
# per-character split below fail.
test_textseq = pd.read_csv(
    '/content/mini-project-1/datasets/test/test_text_seq.csv',
    dtype={'input_str': str},
)

# Split every digit string into a row of integers (one column per character position).
X_test_seq = pd.DataFrame(
    [[int(char) for char in digits] for digits in test_textseq['input_str']]
)

# Load the saved encoder.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted artifacts.
encoder = joblib.load('/content/mini-project-1/saved-models/onehot_encoder.pkl')

# One-hot encode with the already-fitted encoder (transform, not fit_transform).
X_test_encoded = encoder.transform(X_test_seq)

# Load the saved SVM model.
loaded_model_seq = joblib.load('/content/mini-project-1/saved-models/best_svm_model.pkl')

# Predict on the test dataset.
y_pred_seq = loaded_model_seq.predict(X_test_encoded)

# Save predictions to a text file, one prediction per line.
with open("pred_text.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in y_pred_seq)

print("Text sequence predictions saved to 'pred_text.txt'.")


# =========================
# 2. Predictions from Keras Model for Emoticons
# =========================

# Read the emoticon test split and pull the raw emoticon strings out as a list.
test_emoticon_df = pd.read_csv("/content/mini-project-1/datasets/test/test_emoticon.csv")
test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()

# Restore the saved Keras model.
model_emoticon = load_model("/content/mini-project-1/saved-models/dense_model.h5")

# Restore the pickled tokenizer.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted files.
with open("/content/mini-project-1/saved-models/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Tokenize, then post-pad to the sequence length reported by the model's input shape.
max_len = model_emoticon.input_shape[1]
test_sequences = tokenizer.texts_to_sequences(test_emoticon_X)
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Threshold the model outputs at 0.5 to obtain integer 0/1 labels.
emoticon_scores = model_emoticon.predict(test_padded)
predictions_emoticon = (emoticon_scores > 0.5).astype(int)

# Write the first element of each prediction row, one per line
# (zip keeps the output limited to the shorter of inputs/predictions).
with open("pred_emoticon.txt", "w") as f:
    f.writelines(
        f"{pred[0]}\n" for _, pred in zip(test_emoticon_X, predictions_emoticon)
    )

print("Emoticon predictions saved to 'pred_emoticon.txt'.")


# =========================
# 3. Predictions from FeatureModel for Features
# =========================

# Load the trained model and scaler for feature data.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted artifacts.
model_feature = joblib.load("/content/mini-project-1/saved-models/svm_model.joblib")
scaler = joblib.load("/content/mini-project-1/saved-models/scaler.joblib")

# Load the test dataset for features.
# allow_pickle is left at NumPy's safe default (False): the same archive is read
# without it in section 4 of this cell, so the 'features' entry does not need
# pickle support.
test_feat_X = np.load("/content/mini-project-1/datasets/test/test_feature.npz")['features']

# FeatureModel class for prediction
class FeatureModel:
    """Inference wrapper: flatten each sample, optionally scale it, then predict.

    Parameters
    ----------
    model : object exposing ``predict(X)``, optional
        The estimator used to produce predictions. Required by :meth:`predict`.
    scaler : object exposing ``transform(X)``, optional
        Applied to the flattened features before prediction. When ``None``
        the flattened features are handed to the model unscaled.
    """

    def __init__(self, model=None, scaler=None):
        self.model = model
        self.scaler = scaler

    def predict(self, X):
        """Return ``model.predict`` on the flattened (and optionally scaled) input.

        Parameters
        ----------
        X : array-like
            Samples along the first axis; all remaining axes are flattened
            into a single feature axis.

        Raises
        ------
        ValueError
            If no model was supplied to the constructor.
        """
        if self.model is None:
            raise ValueError("FeatureModel.predict requires a model, but model is None")
        # Accept nested lists as well as ndarrays.
        X = np.asarray(X)
        # Collapse every per-sample axis into one feature vector.
        X = X.reshape(X.shape[0], -1)
        if self.scaler is not None:
            X = self.scaler.transform(X)
        return self.model.predict(X)

# Wrap the loaded classifier and scaler in a FeatureModel.
feature_model = FeatureModel(scaler=scaler, model=model_feature)

# Run inference on the test features.
predictions_feature = feature_model.predict(test_feat_X)

# Persist the predictions, one per line.
with open("pred_feat.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predictions_feature)

print("Feature data predictions saved to 'pred_feat.txt'.")


# =========================
# 4. Predictions from Combined Model
# =========================

def _rows_to_text(rows):
    """Stringify each row: space-joined items for a Python list, ``str(row)`` otherwise."""
    return [' '.join(map(str, row)) if isinstance(row, list) else str(row) for row in rows]


def _longest(sequences):
    """Return the length of the longest sequence in ``sequences``."""
    return max(len(seq) for seq in sequences)


# Restore the saved combined model.
model_combined = tf.keras.models.load_model('/content/mini-project-1/saved-models/combined_model.h5')

# --- Test inputs ---
test_emoticon_df = pd.read_csv('/content/mini-project-1/datasets/test/test_emoticon.csv')
test_embedding_df = pd.read_csv('/content/mini-project-1/datasets/test/test_text_seq.csv')
test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()

# The second input is built from the feature archive, stringified row by row.
# NOTE(review): rows of a loaded NumPy array are not Python lists, so it looks like
# the str(row) path is the one taken here -- confirm this matches training.
test_text_X = _rows_to_text(
    np.load('/content/mini-project-1/datasets/test/test_feature.npz')['features']
)

# --- Training inputs (only used to rebuild the tokenizer and the padding lengths) ---
train_emoticon_df = pd.read_csv('/content/mini-project-1/datasets/train/train_emoticon.csv')
train_embedding_df = pd.read_csv('/content/mini-project-1/datasets/train/train_text_seq.csv')
train_emoticon_X = train_emoticon_df['input_emoticon'].tolist()
train_text_X = _rows_to_text(
    np.load('/content/mini-project-1/datasets/train/train_feature.npz')['features']
)

# Character-level tokenizer fitted on the training emoticons plus the stringified features.
# NOTE(review): presumably this mirrors the tokenizer setup used at training time -- verify.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_emoticon_X + train_text_X)

# Emoticon branch: post-pad the test sequences to the longest training sequence.
max_len_emoticon = _longest(tokenizer.texts_to_sequences(train_emoticon_X))
test_emoticon_sequences = tokenizer.texts_to_sequences(test_emoticon_X)
test_emoticon_padded = pad_sequences(test_emoticon_sequences, maxlen=max_len_emoticon, padding='post')

# Text branch: same procedure with its own training-derived maximum length.
max_len_text = _longest(tokenizer.texts_to_sequences(train_text_X))
test_text_sequences = tokenizer.texts_to_sequences(test_text_X)
test_text_padded = pad_sequences(test_text_sequences, maxlen=max_len_text, padding='post')

# Print both input shapes for a manual sanity check.
print(f"Test Emoticon Shape: {test_emoticon_padded.shape}, Test Text Shape: {test_text_padded.shape}")

# Run the two-input model on the padded test data.
predictions_combined = model_combined.predict([test_emoticon_padded, test_text_padded])

# Binarize at 0.5: scores at or above the threshold become 1, everything else 0.
predicted_labels_combined = [
    1 if is_positive else 0 for is_positive in (predictions_combined >= 0.5)
]

# Write the labels, one per line.
with open("pred_combined.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predicted_labels_combined)

print("Combined predictions saved to 'pred_combined.txt'.")




Text sequence predictions saved to 'pred_text.txt'.
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Emoticon predictions saved to 'pred_emoticon.txt'.
Feature data predictions saved to 'pred_feat.txt'.


NameError: name 'max_len_emoticon' is not defined

In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
import joblib  # For loading the model
import pickle  # For loading the tokenizer

from tensorflow.keras.preprocessing.text import Tokenizer


# =========================
# 1. Predictions from SVM Model for Text Sequences
# =========================

# Load the test split of the text-sequence dataset.
# The column is pinned to `str` so pandas never infers a numeric dtype for the
# all-digit strings; a numeric dtype would lose leading zeros and make the
# per-character split below fail.
test_textseq = pd.read_csv(
    '/content/mini-project-1/datasets/test/test_text_seq.csv',
    dtype={'input_str': str},
)

# Split every digit string into a row of integers (one column per character position).
X_test_seq = pd.DataFrame(
    [[int(char) for char in digits] for digits in test_textseq['input_str']]
)

# Load the saved encoder.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted artifacts.
encoder = joblib.load('/content/mini-project-1/saved-models/onehot_encoder.pkl')

# One-hot encode with the already-fitted encoder (transform, not fit_transform).
X_test_encoded = encoder.transform(X_test_seq)

# Load the saved SVM model.
loaded_model_seq = joblib.load('/content/mini-project-1/saved-models/best_svm_model.pkl')

# Predict on the test dataset.
y_pred_seq = loaded_model_seq.predict(X_test_encoded)

# Save predictions to a text file, one prediction per line.
with open("pred_text.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in y_pred_seq)

print("Text sequence predictions saved to 'pred_text.txt'.")


# =========================
# 2. Predictions from Keras Model for Emoticons
# =========================

# Read the emoticon test split and pull the raw emoticon strings out as a list.
test_emoticon_df = pd.read_csv("/content/mini-project-1/datasets/test/test_emoticon.csv")
test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()

# Restore the saved Keras model.
model_emoticon = load_model("/content/mini-project-1/saved-models/dense_model.h5")

# Restore the pickled tokenizer.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted files.
with open("/content/mini-project-1/saved-models/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Tokenize, then post-pad to the sequence length reported by the model's input shape.
max_len = model_emoticon.input_shape[1]
test_sequences = tokenizer.texts_to_sequences(test_emoticon_X)
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Threshold the model outputs at 0.5 to obtain integer 0/1 labels.
emoticon_scores = model_emoticon.predict(test_padded)
predictions_emoticon = (emoticon_scores > 0.5).astype(int)

# Write the first element of each prediction row, one per line
# (zip keeps the output limited to the shorter of inputs/predictions).
with open("pred_emoticon.txt", "w") as f:
    f.writelines(
        f"{pred[0]}\n" for _, pred in zip(test_emoticon_X, predictions_emoticon)
    )

print("Emoticon predictions saved to 'pred_emoticon.txt'.")


# =========================
# 3. Predictions from FeatureModel for Features
# =========================

# Load the trained model and scaler for feature data.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted artifacts.
model_feature = joblib.load("/content/mini-project-1/saved-models/svm_model.joblib")
scaler = joblib.load("/content/mini-project-1/saved-models/scaler.joblib")

# Load the test dataset for features.
# allow_pickle is left at NumPy's safe default (False): the same archive is read
# without it in section 4 of this cell, so the 'features' entry does not need
# pickle support.
test_feat_X = np.load("/content/mini-project-1/datasets/test/test_feature.npz")['features']

# FeatureModel class for prediction
class FeatureModel:
    """Inference wrapper: flatten each sample, optionally scale it, then predict.

    Parameters
    ----------
    model : object exposing ``predict(X)``, optional
        The estimator used to produce predictions. Required by :meth:`predict`.
    scaler : object exposing ``transform(X)``, optional
        Applied to the flattened features before prediction. When ``None``
        the flattened features are handed to the model unscaled.
    """

    def __init__(self, model=None, scaler=None):
        self.model = model
        self.scaler = scaler

    def predict(self, X):
        """Return ``model.predict`` on the flattened (and optionally scaled) input.

        Parameters
        ----------
        X : array-like
            Samples along the first axis; all remaining axes are flattened
            into a single feature axis.

        Raises
        ------
        ValueError
            If no model was supplied to the constructor.
        """
        if self.model is None:
            raise ValueError("FeatureModel.predict requires a model, but model is None")
        # Accept nested lists as well as ndarrays.
        X = np.asarray(X)
        # Collapse every per-sample axis into one feature vector.
        X = X.reshape(X.shape[0], -1)
        if self.scaler is not None:
            X = self.scaler.transform(X)
        return self.model.predict(X)

# Wrap the loaded classifier and scaler in a FeatureModel.
feature_model = FeatureModel(scaler=scaler, model=model_feature)

# Run inference on the test features.
predictions_feature = feature_model.predict(test_feat_X)

# Persist the predictions, one per line.
with open("pred_feat.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predictions_feature)

print("Feature data predictions saved to 'pred_feat.txt'.")


# =========================
# 4. Predictions from Combined Model
# =========================

def _rows_to_text(rows):
    """Stringify each row: space-joined items for a Python list, ``str(row)`` otherwise."""
    return [' '.join(map(str, row)) if isinstance(row, list) else str(row) for row in rows]


def _longest(sequences):
    """Return the length of the longest sequence in ``sequences``."""
    return max(len(seq) for seq in sequences)


# Restore the saved combined model.
model_combined = tf.keras.models.load_model('/content/mini-project-1/saved-models/combined_model.h5')

# --- Test inputs ---
test_emoticon_df = pd.read_csv('/content/mini-project-1/datasets/test/test_emoticon.csv')
test_embedding_df = pd.read_csv('/content/mini-project-1/datasets/test/test_text_seq.csv')
test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()

# The second input is built from the feature archive, stringified row by row.
# NOTE(review): rows of a loaded NumPy array are not Python lists, so it looks like
# the str(row) path is the one taken here -- confirm this matches training.
test_text_X = _rows_to_text(
    np.load('/content/mini-project-1/datasets/test/test_feature.npz')['features']
)

# --- Training inputs (only used to rebuild the tokenizer and the padding lengths) ---
train_emoticon_df = pd.read_csv('/content/mini-project-1/datasets/train/train_emoticon.csv')
train_embedding_df = pd.read_csv('/content/mini-project-1/datasets/train/train_text_seq.csv')
train_emoticon_X = train_emoticon_df['input_emoticon'].tolist()
train_text_X = _rows_to_text(
    np.load('/content/mini-project-1/datasets/train/train_feature.npz')['features']
)

# Character-level tokenizer fitted on the training emoticons plus the stringified features.
# NOTE(review): presumably this mirrors the tokenizer setup used at training time -- verify.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_emoticon_X + train_text_X)

# Emoticon branch: post-pad the test sequences to the longest training sequence.
max_len_emoticon = _longest(tokenizer.texts_to_sequences(train_emoticon_X))
test_emoticon_sequences = tokenizer.texts_to_sequences(test_emoticon_X)
test_emoticon_padded = pad_sequences(test_emoticon_sequences, maxlen=max_len_emoticon, padding='post')

# Text branch: same procedure with its own training-derived maximum length.
max_len_text = _longest(tokenizer.texts_to_sequences(train_text_X))
test_text_sequences = tokenizer.texts_to_sequences(test_text_X)
test_text_padded = pad_sequences(test_text_sequences, maxlen=max_len_text, padding='post')

# Print both input shapes for a manual sanity check.
print(f"Test Emoticon Shape: {test_emoticon_padded.shape}, Test Text Shape: {test_text_padded.shape}")

# Run the two-input model on the padded test data.
predictions_combined = model_combined.predict([test_emoticon_padded, test_text_padded])

# Binarize at 0.5: scores at or above the threshold become 1, everything else 0.
predicted_labels_combined = [
    1 if is_positive else 0 for is_positive in (predictions_combined >= 0.5)
]

# Write the labels, one per line.
with open("pred_combined.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predicted_labels_combined)

print("Combined predictions saved to 'pred_combined.txt'.")




Text sequence predictions saved to 'pred_text.txt'.
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Emoticon predictions saved to 'pred_emoticon.txt'.
Feature data predictions saved to 'pred_feat.txt'.




Test Emoticon Shape: (2232, 13), Test Text Shape: (2232, 635)
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 113ms/step
Combined predictions saved to 'pred_combined.txt'.
