In [3]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from textstat import flesch_kincaid_grade
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf


1. Data Cleaning
First, load and clean your dataset:

In [None]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('your_dataset.csv')

# Display initial data
print(df.head())

# Basic cleaning: every later feature step reads both 'title' and 'selftext',
# so rows missing either column are dropped, exact duplicate rows are removed,
# and the index is rebuilt so it is contiguous again.
df = (
    df.dropna(subset=['title', 'selftext'])
      .drop_duplicates()
      .reset_index(drop=True)
)

# Remove irrelevant characters (optional)
# Every character that is not a word character or basic sentence punctuation
# (. , ! ? ') is replaced by a space. The punctuation is kept on purpose:
# the question-type step in this notebook looks for '?' in the title, and a
# blanket \W replacement would erase it before that check ever runs.
TEXT_NOISE_PATTERN = r"[^\w.,!?']"
df['title'] = df['title'].str.replace(TEXT_NOISE_PATTERN, ' ', regex=True)
df['selftext'] = df['selftext'].str.replace(TEXT_NOISE_PATTERN, ' ', regex=True)

print(df.head())


In [None]:
# Load the dataset
df = pd.read_csv('your_dataset.csv')

# Data Cleaning
# Drop rows without a title or body, drop exact duplicates, rebuild the index.
df = (
    df.dropna(subset=['title', 'selftext'])
      .drop_duplicates()
      .reset_index(drop=True)
)

# Replace noise characters with a space but keep basic sentence punctuation
# (. , ! ? '). The question-type step in this notebook checks the title for
# '?', which a blanket \W replacement would remove before the check runs.
TEXT_NOISE_PATTERN = r"[^\w.,!?']"
df['title'] = df['title'].str.replace(TEXT_NOISE_PATTERN, ' ', regex=True)
df['selftext'] = df['selftext'].str.replace(TEXT_NOISE_PATTERN, ' ', regex=True)

df.head()


A. ADHD-Related Keywords Frequency:

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Example keywords related to ADHD
keywords = ['attention', 'hyperactivity', 'impulsivity', 'distraction', 'forgetfulness', 'restlessness', 'talkativeness']

# Function to count keyword occurrences
def count_keywords(text, keywords):
    """Count case-insensitive substring occurrences of the keywords in ``text``.

    Args:
        text: The string to search.
        keywords: Iterable of keyword strings. Matching is done on substrings,
            so 'attention' is also counted inside 'inattention'.

    Returns:
        The total number of non-overlapping occurrences, summed over all keywords.
    """
    # Lower-case once instead of once per keyword.
    lowered_text = text.lower()
    # Keywords are lower-cased as well, otherwise an upper-case keyword such as
    # 'ADHD' could never match the lower-cased text. Empty keywords are skipped
    # because str.count('') returns len(text) + 1 rather than 0.
    return sum(lowered_text.count(keyword.lower()) for keyword in keywords if keyword)

# Count keyword hits separately for the title and the body of every post.
# `keywords` is forwarded to count_keywords as its second positional argument.
df['title_keywords_count'] = df['title'].apply(count_keywords, args=(keywords,))
df['selftext_keywords_count'] = df['selftext'].apply(count_keywords, args=(keywords,))


B. Sentiment Score:

Analyze sentiment using libraries like VADER or TextBlob.

In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    The score is a value between -1 (negative) and 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Score the title and the body of every post independently.
for source_col, target_col in (('title', 'title_sentiment'),
                               ('selftext', 'selftext_sentiment')):
    df[target_col] = df[source_col].apply(get_sentiment)


C. Topic Modeling:

Use Latent Dirichlet Allocation (LDA) for topic modeling.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize the text data
# NOTE: one vectorizer instance is fitted on 'title' and then fitted again on
# 'selftext'; each matrix comes from its own fit, but afterwards `vectorizer`
# reflects only the second fit.
vectorizer = TfidfVectorizer(stop_words='english')
X_title = vectorizer.fit_transform(df['title'])
X_selftext = vectorizer.fit_transform(df['selftext'])

# Apply LDA: an independent 5-topic model per text column, seeded so the
# topics are the same on every run.
lda_title = LatentDirichletAllocation(n_components=5, random_state=0).fit(X_title)
lda_selftext = LatentDirichletAllocation(n_components=5, random_state=0).fit(X_selftext)

# Get the topic distribution for each document
# Each cell of the new columns holds one row of the transform output.
title_topic_rows = lda_title.transform(X_title)
selftext_topic_rows = lda_selftext.transform(X_selftext)
df['title_topic_distribution'] = list(title_topic_rows)
df['selftext_topic_distribution'] = list(selftext_topic_rows)


D. Named Entity Recognition (NER):

Use spaCy for NER.

In [None]:
import spacy

# Load the small English spaCy pipeline once; extract_entities reuses it.
nlp = spacy.load('en_core_web_sm')

def extract_entities(text):
    """Return the text of every named entity spaCy finds in ``text``, in order."""
    entity_texts = []
    for entity in nlp(text).ents:
        entity_texts.append(entity.text)
    return entity_texts

# Store the list of entity strings found in each title and each body.
for source_col, target_col in (('title', 'title_entities'),
                               ('selftext', 'selftext_entities')):
    df[target_col] = df[source_col].apply(extract_entities)


E. Text Complexity Score:

Calculate readability scores like Flesch-Kincaid.

In [None]:
from textstat import flesch_kincaid_grade

# Flesch-Kincaid grade of the title and of the body of every post.
for source_col, target_col in (('title', 'title_complexity'),
                               ('selftext', 'selftext_complexity')):
    df[target_col] = df[source_col].apply(flesch_kincaid_grade)


F. Contextual Relevance Score:

Use keyword matching or similarity measures to evaluate relevance.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_relevance_score(text, keyword_list):
    """Return the mean TF-IDF cosine similarity between ``text`` and the keywords.

    A fresh TfidfVectorizer is fitted on ``[text] + keyword_list`` for every
    call, so row 0 of the matrix is the text and the remaining rows are the
    keywords.

    Args:
        text: The document to score.
        keyword_list: List of keyword strings to compare against.

    Returns:
        The mean of the similarities between the text and each keyword.
    """
    corpus = [text] + keyword_list
    tfidf = TfidfVectorizer().fit_transform(corpus)
    text_vector, keyword_vectors = tfidf[0:1], tfidf[1:]
    return cosine_similarity(text_vector, keyword_vectors).mean()

# Example keywords
keywords_list = ['ADHD', 'attention deficit', 'hyperactivity', 'impulsivity']

# Score title and body against the same keyword list; the list is forwarded
# to get_relevance_score as its second positional argument.
df['title_relevance'] = df['title'].apply(get_relevance_score, args=(keywords_list,))
df['selftext_relevance'] = df['selftext'].apply(get_relevance_score, args=(keywords_list,))


G. Emotional Tone:

Use sentiment analysis or specialized libraries.

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# One shared VADER analyzer; get_emotional_tone reads it from module scope.
analyzer = SentimentIntensityAnalyzer()

def get_emotional_tone(text):
    """Return VADER's compound score for ``text``.

    The score is a value between -1 (negative) and 1 (positive).
    """
    return analyzer.polarity_scores(text)['compound']

# Compound tone score for the title and for the body of every post.
for source_col, target_col in (('title', 'title_emotional_tone'),
                               ('selftext', 'selftext_emotional_tone')):
    df[target_col] = df[source_col].apply(get_emotional_tone)


H. Question Type:

Classify each title as an inquiry, a complaint, or a statement.

In [None]:
def classify_question_type(text):
    """Label ``text`` as 'Inquiry', 'Complaint' or 'Statement'.

    A question mark anywhere in the text wins; otherwise the text is a
    complaint when it contains 'complain' (case-insensitive), and a plain
    statement in every other case.
    """
    if '?' in text:
        return 'Inquiry'
    return 'Complaint' if 'complain' in text.lower() else 'Statement'

# Only the title is classified; the body text is not consulted.
df['question_type'] = df['title'].apply(lambda title: classify_question_type(title))


Cell 11: Transformer Model Embeddings

In [None]:
# Load Pre-trained Transformer Model
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, tokenizer, model, max_len=128, batch_size=32):
    """Return the [CLS] embedding of every text as one TensorFlow tensor.

    The texts are encoded and run through the model in chunks of
    ``batch_size`` rather than in a single forward pass, so memory use is
    bounded by the batch size instead of growing with the whole dataset.

    Args:
        texts: The strings to embed; a pandas Series, NumPy array or any
            other iterable of strings.
        tokenizer: Tokenizer called with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='tf'``.
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``last_hidden_state``.
        max_len: Maximum token length; longer inputs are truncated.
        batch_size: Number of texts per forward pass.

    Returns:
        A tensor with one row per input text, in input order, holding the
        hidden state at position 0 (the [CLS] token).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``texts`` is empty.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    if not text_list:
        raise ValueError('texts must contain at least one string')

    cls_batches = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        # padding=True pads to the longest text of this batch only; the
        # attention mask tells the model to ignore the padded positions.
        encodings = tokenizer(batch, truncation=True, padding=True, max_length=max_len, return_tensors='tf')
        outputs = model(encodings['input_ids'], attention_mask=encodings['attention_mask'])
        cls_batches.append(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token embeddings

    # Stitch the per-batch results back together along the row axis.
    return tf.concat(cls_batches, axis=0)

# Embed titles and bodies with the same tokenizer/model pair, keeping both
# the TensorFlow tensor and a NumPy copy of each result.
title_embeddings = get_bert_embeddings(df['title'], tokenizer=tokenizer, model=bert_model)
title_embeddings_np = title_embeddings.numpy()

selftext_embeddings = get_bert_embeddings(df['selftext'], tokenizer=tokenizer, model=bert_model)
selftext_embeddings_np = selftext_embeddings.numpy()


Cell 12: Combine Features

In [None]:
# Combine Features
# Each embedding frame gets its own prefixed column names. Without them both
# frames would be labelled 0..N-1, and the concatenated frame would contain
# duplicate (and mixed int/str) column labels.
title_embeddings_df = pd.DataFrame(
    title_embeddings_np,
    index=df.index,
    columns=[f'title_emb_{i}' for i in range(title_embeddings_np.shape[1])],
)
selftext_embeddings_df = pd.DataFrame(
    selftext_embeddings_np,
    index=df.index,
    columns=[f'selftext_emb_{i}' for i in range(selftext_embeddings_np.shape[1])],
)

# Scalar features engineered in the earlier cells.
HANDCRAFTED_FEATURE_COLUMNS = [
    'title_keywords_count', 'selftext_keywords_count', 'title_sentiment', 'selftext_sentiment',
    'title_complexity', 'selftext_complexity', 'title_relevance', 'selftext_relevance',
    'title_emotional_tone', 'selftext_emotional_tone',
]

# All three frames share df's index, so the column-wise concat aligns row by row.
feature_df = pd.concat([
    df[HANDCRAFTED_FEATURE_COLUMNS],
    title_embeddings_df,
    selftext_embeddings_df
], axis=1)
assert feature_df.columns.is_unique, 'feature columns must have unique names'

# Example Labels (Replace with actual labels)
# The dtype is requested explicitly so the one-hot labels are numeric no
# matter which default dtype the installed pandas version uses.
labels = pd.get_dummies(df['question_type'], dtype='float32')


Cell 13: Train-Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    labels,
    test_size=0.3,
    random_state=42,
)

# Show the shape of each split as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


Cell 14: Define and Train Deep Learning Model

In [None]:
# Define and Train Deep Learning Model
import matplotlib.pyplot as plt

# The input width is declared with an explicit Input object instead of the
# deprecated `input_dim` argument on the first Dense layer.
model = Sequential([
    tf.keras.Input(shape=(feature_df.shape[1],)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # One softmax unit per one-hot label column.
    Dense(labels.shape[1], activation='softmax')
])

model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# validation_split holds out 20% of the training rows for the validation curves.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Plot Training History
plt.figure(figsize=(12, 6))

# Left panel: training vs. validation loss per epoch.
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss')

# Right panel: training vs. validation accuracy per epoch.
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title('Accuracy')
plt.show()


# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


In [None]:
# Score the trained model on the held-out test split and report its accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Test Accuracy: {accuracy:.4f}')
