In [1]:
import pandas as pd

def loadData(path):
	"""Read the CSV found at ``path`` and return its contents as a DataFrame.

	Args:
		path: Anything ``pandas.read_csv`` accepts (file path, URL, or
			file-like object).

	Returns:
		The parsed ``pandas.DataFrame``.
	"""
	frame = pd.read_csv(path)
	return frame

In [2]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

def flattenText(text):
	"""Lower-case ``text``, tokenize it, and re-join the tokens with spaces.

	Args:
		text: The raw string to normalize.

	Returns:
		A single string in which every token produced by ``word_tokenize``
		is separated from the next by exactly one space.
	"""
	lowered = text.lower()
	return ' '.join(word_tokenize(lowered))

# Quick sanity check: punctuation and '#' come out as separate, space-delimited tokens.
print(flattenText('Hello world! How #are you?!'))

hello world ! how # are you ? !


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sauron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:

import re

def extractTags(text):
	"""Collect every hashtag that appears in ``text``.

	A hashtag is a '#' immediately followed by one or more word characters
	(letters, digits, or underscore).

	Args:
		text: The string to scan.

	Returns:
		A list of the matched hashtags (leading '#' included), in order of
		appearance; an empty list when there are none.
	"""
	return re.compile(r"#\w+").findall(text)
	
#print(extractTags('Hello world! How #are you?!'))

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import numpy as np

def textVectorizer(sentences):
	"""Fit a TF-IDF vectorizer on ``sentences`` and return it.

	Args:
		sentences: An iterable of strings used to learn the vocabulary and
			IDF weights.

	Returns:
		The fitted ``TfidfVectorizer``; call ``.transform`` on it to
		vectorize text.
	"""
	# Only the fitted state is needed here, so use fit() rather than
	# fit_transform(), whose returned matrix was being thrown away.
	return TfidfVectorizer().fit(sentences)

def tagEncoder(labels):
	"""Fit a multi-hot encoder on collections of hashtags.

	Args:
		labels: An iterable whose items are themselves iterables of labels
			(one collection per sample).

	Returns:
		The fitted ``MultiLabelBinarizer``.
	"""
	# fit() returns the estimator itself, so it can be returned directly.
	return MultiLabelBinarizer().fit(labels)

def keywordEncoder(keywords):
	"""Fit a multi-hot encoder on per-sample keyword collections.

	Args:
		keywords: An iterable whose items are iterables of keywords (one
			collection per sample).

	Returns:
		The fitted ``MultiLabelBinarizer``.
	"""
	encoder = MultiLabelBinarizer()
	return encoder.fit(keywords)


In [5]:
import nlpaug.augmenter.word as naw

# Data augmentation using synonym replacement.
# Module-level augmenter instance shared by augment_text() below.
aug = naw.SynonymAug()

def augment_text(text):
	"""Return an augmented variant of ``text`` produced by the shared ``aug``.

	Args:
		text: The string to augment.

	Returns:
		The pieces returned by ``aug.augment(text)`` joined with ". ".
	"""
	augmented_pieces = aug.augment(text)
	separator = ". "
	return separator.join(augmented_pieces)




In [6]:
import pandas as pd
import numpy as np

def copy_row(rows):
	"""Return a copy of ``rows`` whose 'text' column has been augmented.

	Args:
		rows: A DataFrame with a 'text' column of strings.

	Returns:
		A new DataFrame equal to ``rows`` except that every 'text' value has
		been replaced by ``augment_text(value)``. ``rows`` itself is left
		unmodified.
	"""
	# Work on a copy: assigning into the argument would mutate the caller's
	# frame (or a slice of it) and can trigger SettingWithCopyWarning.
	augmented = rows.copy()
	augmented['text'] = augmented['text'].apply(augment_text)
	return augmented

# Seed NumPy's global RNG, which expand_dataset() uses (np.random.choice) to pick rows to duplicate.
np.random.seed(42)

def expand_dataset(data, target_size):
	"""Oversample under-represented classes with augmented copies of their rows.

	For every distinct value of ``data['target']`` that occurs fewer than
	``target_size`` times, rows of that class are drawn at random (with
	replacement, using NumPy's global random state), passed through
	``copy_row`` and appended, so the class ends up with ``target_size``
	rows. Classes that already have at least ``target_size`` rows are left
	as they are.

	Args:
		data: DataFrame with at least a 'target' column and a 'text' column.
		target_size: Minimum number of rows each class should have.

	Returns:
		A new DataFrame containing the original rows followed by the
		generated rows, with a fresh 0..n-1 index. ``data`` is not modified.
	"""
	class_counts = data['target'].value_counts()

	# Start with the original frame; generated frames are appended after it.
	frames = [data]

	for target_value in class_counts[class_counts < target_size].index:
		target_rows = data[data['target'] == target_value]
		rows_missing = target_size - len(target_rows)

		# Sample row positions rather than index labels: a label lookup
		# returns every row sharing that label, which would over-generate
		# rows if the index of `data` is not unique.
		sampled_positions = np.random.choice(len(target_rows), rows_missing)

		# Explicit copy so copy_row never writes into a slice of `data`.
		frames.append(copy_row(target_rows.iloc[sampled_positions].copy()))

	# ignore_index=True already renumbers the result, so no reset_index is needed.
	return pd.concat(frames, ignore_index=True)

# Sample DataFrame
""" data = pd.DataFrame({
	'text': ['hello', 'world', 'how are you', 'what\'s your name', 'hi'],
	'target': [0, 0, 0, 1, 1],
})

target_size = 5

# Call the function to expand the dataset
expanded_df = expand_dataset(data, target_size)

print(expanded_df) """


" data = pd.DataFrame({\n\t'text': ['hello', 'world', 'how are you', 'what's your name', 'hi'],\n\t'target': [0, 0, 0, 1, 1],\n})\n\ntarget_size = 5\n\n# Call the function to expand the dataset\nexpanded_df = expand_dataset(data, target_size)\n\nprint(expanded_df) "

In [7]:
# Load the training set and replace missing values with empty strings.
data = loadData('train.csv').fillna('')

# Top up every class that has fewer than 5000 rows with augmented copies.
data = expand_dataset(data, 5000)

# Derived columns: the hashtags found in each text and its tokenized, lower-cased form.
data['hashtags'] = data['text'].apply(extractTags)
data['normalized_text'] = data['text'].apply(flattenText)

# Share of rows labelled 1, followed by the final frame dimensions.
target = data['target']
print((target == 1).sum() / target.size)
print(data.shape)

0.5
(10000, 7)


In [8]:
# Fit the three feature encoders on the prepared frame.
vectorizer = textVectorizer(data['normalized_text'])
hashtags = tagEncoder(data['hashtags'])
# Each keyword is wrapped in its own one-element row so it is treated as a single label.
keywords = keywordEncoder(data['keyword'].to_numpy().reshape(-1, 1))


In [9]:
from scipy.sparse import vstack, hstack

def rowsToVectors(rows):
	"""Encode ``rows`` into a single feature matrix.

	Uses the module-level ``vectorizer``, ``hashtags`` and ``keywords``
	encoders and stacks their outputs side by side, in that order.

	Args:
		rows: DataFrame with 'normalized_text', 'hashtags' and 'keyword'
			columns.

	Returns:
		The horizontally stacked sparse matrix, one row per input row.
	"""
	text_features = vectorizer.transform(rows['normalized_text'])
	hashtag_features = hashtags.transform(rows['hashtags'])
	# Wrap each keyword in a list so it is encoded as one label, not as characters.
	keyword_features = keywords.transform([[kw] for kw in rows['keyword']])
	return hstack([text_features, hashtag_features, keyword_features])


In [10]:
from scipy.sparse import vstack, hstack
#print(hashtags.transform(data['hashtags']).shape)
#print(vectorizer.transform(data['normalized_text']).shape)
#print(np.array(keywords.transform([[keyword] for keyword in data['keyword']])).shape)
#train_data = hstack([
#	vectorizer.transform(data['normalized_text']), 
#	hashtags.transform(data['hashtags']),
#	keywords.transform([[keyword] for keyword in data['keyword']])
#])
# Encode the whole prepared frame into one feature matrix and show its dimensions.
train_data = rowsToVectors(data)
train_data.shape

(10000, 26024)

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded rows for testing (fixed random_state for a repeatable split).
# NOTE(review): the augmented copies were generated and the encoders were fitted on the full
# frame before this split, so the test rows are not fully independent of the training rows;
# consider splitting first and augmenting/fitting on the training part only.
X_train, X_test, y_train, y_test = train_test_split(train_data, data['target'], test_size=0.2, random_state=42)


In [12]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression fitted on the training split
# (fit() returns the estimator, so construction and training are chained).
regrModel = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = regrModel.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.813


In [13]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Gradient-boosted trees with default hyper-parameters, fitted on the training split.
xgb_model = xgb.XGBClassifier().fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 breakdown.
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.799
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81      1043
           1       0.81      0.75      0.78       957

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.80      0.80      0.80      2000



In [15]:
# Split the training portion again (80/20) to obtain a validation set for the neural network.
# The sparse training matrix is converted to a dense array first via .toarray().
X_train2, X_val, y_train2, y_val = train_test_split(X_train.toarray(), y_train, test_size=0.2, random_state=42)
X_train2.shape

(6400, 26024)

In [24]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Inputs for the network: the dense training split produced by the validation-split cell.
X = X_train2
y = y_train2

# Feed-forward binary classifier: two ReLU hidden layers, each followed by
# dropout, and a single sigmoid output unit.
neural_model = Sequential([
	Dense(128, activation='relu', input_shape=(X.shape[1],)),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dropout(0.2),
	Dense(1, activation='sigmoid'),
])
neural_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs (restoring the best
# weights), and keep the best model seen so far on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

# Train, monitoring the validation split after every epoch.
history = neural_model.fit(
	X,
	y,
	epochs=30,
	batch_size=64,
	validation_data=(X_val, y_val),
	callbacks=[early_stopping, checkpoint],
)

# Final score on the held-out test split (converted to a dense array first).
loss, accuracy = neural_model.evaluate(X_test.toarray(), y_test)
print(f'Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Test loss: 0.3813, Test accuracy: 0.8400


In [28]:
# Re-seed NumPy's global RNG without a fixed value (undoing the earlier seed(42)) so example() is not tied to that seed.
np.random.seed(None)
def example(model, count):
	"""Print predictions for ``count`` randomly chosen rows of ``data``.

	Rows are drawn without replacement from the module-level ``data`` frame,
	encoded with ``rowsToVectors`` and fed to ``model.predict`` as a dense
	array. One line is printed per row: the prediction rounded to two
	decimals, the true target in parentheses, then the original text.

	Args:
		model: Object whose ``predict(array, verbose=0)`` returns one
			prediction per input row.
		count: Number of rows to sample.
	"""
	random_row_indices = np.random.choice(data.shape[0], count, replace=False)
	random_rows = data.iloc[random_row_indices]
	dense_features = rowsToVectors(random_rows).toarray()
	predictions = np.round(model.predict(dense_features, verbose=0), 2)
	for prediction, real_index in zip(predictions, random_row_indices):
		row = data.iloc[real_index]
		print(f"{prediction} ({row['target']}): {row['text']}")

In [29]:
# Show the neural network's predictions next to the true labels for 10 random rows.
num_random_rows = 10
example(neural_model, num_random_rows)

[0.13] (1): Brand man pikin clangoring? ?? ???
[0.07] (0): Watch Sarah Palin OBLITERATE Planned Parenthood For Targeting Minority Women! ÛÒ BB4SP http://t.co/fqMYprlG9g
[0.9] (1): I came up with an idea of a fragrance concept for a bath bomb called The Blood of my Enemies. So you can say that's what you bathe in.
[0.35] (0): #LukeBox something about first responders/ military they are our true Hero's!! Besides your music
[0.14] (0): This real shit will damage a bitch
[0.55] (0): They sky was ablaze tonight in Los Angeles. I'm expecting IG and FB to be filled with sunset shots if I know my peeps!!
[0.02] (0): @Jolly_Jinu you said they are terrorist because of #Babri so was it ok? If you demolish my house todayhave i right to take revenge?
[0.99] (1): US drone bombs Islamic State target in Syria after taking off from Turkey: A US armed drone has bombed a targe... http://t.co/m0daP5xLwo
[0.89] (1): If you ' ray in search of herculean content to improve your business operating theater let