## 1. Setup and Data Loading

In [1]:
import string
import re
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the training CSV and the test-essay CSV into DataFrames
train_path = "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
test_path = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

### Data Preprocessing

In [2]:
def remove_non_english_chars(text):
    """Strip every character that is not an ASCII letter, digit, punctuation mark or space.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every character outside the allowed set
        (accented letters, emojis, newlines, tabs, ...) has been removed.
    """
    # Allowed alphabet: ASCII letters, digits, punctuation and the plain space
    en_chars = string.ascii_letters + string.digits + string.punctuation + ' '

    # The alphabet must be escaped before it is placed inside [...]: punctuation
    # contains ']', '\', '^' and '-', which would otherwise close the class,
    # start an escape sequence, negate the class, or form a character range.
    non_en_pattern = re.compile(f'[^{re.escape(en_chars)}]')

    # Substitute everything outside the allowed alphabet with an empty string
    return non_en_pattern.sub('', text)


In [3]:
seed = 202


def seed_everything(seed=202):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    import random

    # Both seeders take the same integer; apply them in the same order as before
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(seed)

In [4]:
## credit @siddhvr
# Split the training frame by its 'source' column and draw a fixed-size,
# fixed-seed sample of the persuade_corpus rows.
not_persuade_df = train_df[train_df['source'] != 'persuade_corpus']
persuade_df = train_df[train_df['source'] == 'persuade_corpus']
sampled_persuade_df = persuade_df.sample(n=6000, random_state=42)

# Testing idea from discussion with @nbroad about limited characters in human essays
# all_human: distinct characters seen in the sampled persuade essays
# other:     distinct characters seen in every non-persuade essay
all_human = set(''.join(sampled_persuade_df['text'].to_list()))
other = set(''.join(not_persuade_df['text'].to_list()))
# Characters that occur only outside the sampled persuade essays
chars_to_remove = ''.join(ch for ch in other if ch not in all_human)
print(chars_to_remove)

# Deletion-only translation table: maps each of those characters to None
translation_table = str.maketrans('', '', chars_to_remove)


def remove_chars(s):
    """Return ``s`` with every character found in ``chars_to_remove`` deleted."""
    return s.translate(translation_table)

😮🕵😜手📅😕机🐟÷こ道必🙀一📧🛸🏽🌐🍭🤞ち🌨上🎸🧚🥑💚”🌞👋禁止集🙄í🎊响👯👍🍗🌏🍖�🤷🥯💅🔑に🎄💬は🏞💔🐳🐾😭🐰🌧📄🐸🛣影🚀🧐🇺🚔🇵🤣🍜🏜🍓🗣🎨👦💀🎮🎵📱♂す🌳🍕🎹п🚕🍴🌯🏨📷り🏰🎣💨с🌻🏻🤜🥘😠中👨📖🏼🍲💆📉🏔👂🎓🧭😘😎🥗🕺🇧🐭💤🚫👕📹🎠🌎🏏🙃💉“📦🎩🍎🐦🐴🥩□力🥲д💃司😒有ç😃🤩🌱🎾🥤🌽🙅🕹📊🥟о🌴🐒–‍🐧🏳


In [5]:
# Run remove_chars over every training essay, element by element
train_df['text'] = train_df['text'].map(remove_chars)

In [6]:
# Sample additional data if needed: 8000 rows with label == 1, drawn from the
# full training frame. random_state is pinned (like the persuade sample earlier
# in the notebook) so re-running this cell always selects the same rows instead
# of depending on the current state of NumPy's global RNG.
additional_data = train_df[train_df['label'] == 1].sample(8000, random_state=seed)
# Keep only the rows flagged RDizzl3_seven, then append the extra label-1 rows
train_df = train_df[train_df['RDizzl3_seven'].eq(True)]
train_df = pd.concat([train_df, additional_data])

# Clean and concatenate the text from both datasets.
# regex=False makes the literal newline match explicit, so the result does not
# depend on the pandas-version default of the `regex` argument.
train_df['text'] = train_df['text'].str.replace('\n', '', regex=False)
test_df['text'] = test_df['text'].str.replace('\n', '', regex=False)
combined_text = pd.concat([train_df['text'], test_df['text']], axis=0)

## 2. Feature Engineering

In [7]:
%%time

def _word_tokens(text):
    """Return the maximal runs of word characters found in ``text``."""
    return re.findall(r'[^\W]+', text)


# Initialize TfidfVectorizer: sublinear TF over 3- and 4-token n-grams, using
# the custom tokenizer above (token_pattern is disabled because it is unused).
tfidf_params = dict(
    sublinear_tf=True,
    ngram_range=(3, 4),  # So far, ngram_range=(3, 5) reached the best score
    tokenizer=_word_tokens,
    token_pattern=None,
    strip_accents='unicode',
)
vectorizer = TfidfVectorizer(**tfidf_params)

# # Fit the vectorizer on the combined text data
# vectorizer.fit(combined_text)

# Fit the vectorizer on the test text data, so the vocabulary comes from test_df only
vectorizer.fit(test_df['text'])

# Transform the text data into features (train first, then test)
X_train, X_test = (vectorizer.transform(frame['text']) for frame in (train_df, test_df))


CPU times: user 26.5 s, sys: 3.89 ms, total: 26.5 s
Wall time: 26.5 s


## 3. Model Training with Ensemble Learning

In [8]:
%%time

# Define the base classifiers; the three SGD variants share the same loss and
# differ in tolerance, iteration cap, class weighting and early stopping.
huber = "modified_huber"
mnb_model1 = MultinomialNB(alpha=0.02)
lr_model1 = LogisticRegression()
sgd_model1 = SGDClassifier(loss=huber, tol=1e-3, max_iter=10000)
sgd_model2 = SGDClassifier(loss=huber, tol=5e-4, max_iter=10000, class_weight="balanced")
sgd_model3 = SGDClassifier(loss=huber, tol=3e-4, max_iter=15000, early_stopping=True)

# Combine the classifiers into a soft-voting ensemble with equal weights
named_models = [
    ('mnb1', mnb_model1),
    ('lr', lr_model1),
    ('sgd1', sgd_model1),
    ('sgd2', sgd_model2),
    ('sgd3', sgd_model3),
]
equal_weights = [0.2] * len(named_models)
ensemble = VotingClassifier(estimators=named_models, weights=equal_weights, voting='soft')

# Train the ensemble model on the TF-IDF features and the 'label' column
ensemble.fit(X_train, train_df['label'])


CPU times: user 100 ms, sys: 923 µs, total: 101 ms
Wall time: 112 ms


## 4. Prediction and Submission File Creation

In [9]:
# Generate predictions for the test set: keep the second column of
# predict_proba (the probability assigned to ensemble.classes_[1]).
predictions = ensemble.predict_proba(X_test)[:, 1]

# Pair each test id with its predicted probability
submission_df = test_df[['id']].assign(generated=predictions)

# Save the submission file (without the index column)
submission_df.to_csv('/kaggle/working/submission.csv', index=False)


In [10]:
# Display the submission frame (columns 'id' and 'generated') as this cell's output
submission_df

Unnamed: 0,id,generated
0,0000aaaa,0.498832
1,1111bbbb,0.498832
2,2222cccc,0.498832


## 5. Grid Search for Hyperparameter Tuning (Optional)

In [11]:
# from sklearn.metrics import make_scorer, roc_auc_score
# from sklearn.model_selection import GridSearchCV

# # Apply GridSearch
# isGridSearch = True  
# if isGridSearch:
#     param_grid = {'weights': [(0.2, 0.2, 0.2, 0.2, 0.2)]}  # Example weights (one per estimator in the ensemble), expand as needed
#     scorer = make_scorer(roc_auc_score)

#     grid_search = GridSearchCV(estimator=ensemble, param_grid=param_grid, scoring=scorer, cv=7)
#     grid_search.fit(X_train, train_df['label'])

#     # Get the best weights
#     best_weights = grid_search.best_params_['weights']
#     print(f"Best Weights: {best_weights}")

#     # Use the best weights to make predictions
#     ensemble.set_params(weights=best_weights)
#     ensemble.fit(X_train, train_df['label'])
#     predictions = ensemble.predict_proba(X_test)[:, 1]
    
#     # Update submission DataFrame and save again
#     submission_df = pd.DataFrame({
#     'id': test_df['id'],
#     'generated': predictions
# })
    
#     # Save the submission file
#     submission_df.to_csv('/kaggle/working/submission.csv', index=False)


In [12]:
# submission_df