# Library imports

In [12]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import recall_score, precision_score, f1_score

# Dataset Writer

In [13]:
# Enhanced varied content
topics = ["machine learning", "cooking", "deep learning", "woodworking", "neural networks","technology","sports","travel","bach","politics","full stack developer","Data Science","AI","Data Analysis","Data Engineer"]
modifiers = ["Latest trends in", "Fundamental concepts of", "Advanced techniques in", "Introduction to", "Practical guide to"]
verbs = ["explores", "discusses", "introduces", "explains", "covers"]

# Generate 500 samples with more variation
content = []
for _ in range(1500):
    topic = np.random.choice(topics)
    modifier = np.random.choice(modifiers)
    verb = np.random.choice(verbs)
    phrase = f"{modifier} {topic} {verb}"
    content.append(phrase)

np.random.shuffle(content)

# Generate labels, where label is 1 if 'learning', 'deep', or 'neural' is in the text, else 0
labels = [1 if any(word in text for word in ["learning", "deep", "neural"]) else 0 for text in content]

# Create DataFrame
df = pd.DataFrame({
    "content": content,
    "label": labels
})

# Save to CSV
df.to_csv('search_engine_data.csv', index=False)

# Display the DataFrame
df


# Read Dataset

In [14]:
df=pd.read_csv('search_engine_data.csv')
df

Unnamed: 0,content,label
0,Latest trends in politics introduces,0
1,Latest trends in deep learning explains,1
2,Advanced techniques in bach introduces,0
3,Practical guide to woodworking explores,0
4,Practical guide to Data Engineer introduces,0
...,...,...
1495,Fundamental concepts of Data Science discusses,0
1496,Fundamental concepts of neural networks explains,1
1497,Fundamental concepts of sports discusses,0
1498,Introduction to woodworking discusses,0


# Preprocessing

In [15]:
# Preprocessing function to clean text
def preprocess_text(text):
    # convert text to lower-case
    text = text.lower()  
    # remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])  
    # tokenize by spaces
    tokens = text.split()  
    # load stopwords
    stop_words = stopwords.words('english')  
    tokens = [token for token in tokens if token not in stop_words]  =
    return ' '.join(tokens)

# Preprocess the content
df['content'] = df['content'].apply(preprocess_text)

# TF-IDF and Machine Learning Model Training with L2 regularization

In [16]:
# TF-IDF Vectorization with adjusted parameters
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2), max_features=500)
# Split data into training and testing sets
X = tfidf_vectorizer.fit_transform(df['content'])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train Logistic Regression with L2 regularization
model = LogisticRegression(random_state=42, penalty='l2', C=0.1)
model.fit(X_train, y_train)

# Predict Model

In [17]:
# Predict on the test set
    y_pred = model.predict(X_test)

# Evaluate the model

In [18]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Optionally perform k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(model, X, y, cv=kf)
from sklearn.metrics import recall_score, precision_score, f1_score

# Calculate recall, precision, and F1-score
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Test Accuracy: {accuracy:.2f}")
print(f"Cross-Validation Accuracy Scores: {cross_val_scores}")
print(f"Average Cross-Validation Accuracy: {np.mean(cross_val_scores):.4f}")


Recall: 0.89
Precision: 1.00
F1-score: 0.94
Test Accuracy: 0.98
Cross-Validation Accuracy Scores: [0.98       0.98666667 0.97666667 0.97       0.9       ]
Average Cross-Validation Accuracy: 0.9627


# Write a function to search and predict 

In [19]:
def search_and_predict(query, tfidf_vectorizer, model, df, top_n=5):
    # Transform the query into TF-IDF vector
    query_vector = tfidf_vectorizer.transform([query])
    
    # Compute cosine similarities for query-document similarity
    cosine_similarities = cosine_similarity(query_vector, tfidf_vectorizer.transform(df['content'])).flatten()
    
    # Get model's prediction probabilities for documents being relevant
    probabilities = model.predict_proba(tfidf_vectorizer.transform(df['content']))[:, 1]
    
    # Combine probabilities with cosine similarities by multiplication to rank documents
    relevance_scores = probabilities * cosine_similarities
    
    # Get indices of top N relevant documents based on relevance scores
    top_indices = np.argsort(-relevance_scores)[:top_n]
    top_similarities = cosine_similarities[top_indices]
    top_probabilities = probabilities[top_indices]
    top_documents = df['content'].iloc[top_indices]
    
    # Prepare the results DataFrame
    results_df = pd.DataFrame({
        'document_id': top_indices,
        'similarity': top_similarities,
        'probability': top_probabilities,
        'document': top_documents
    })

    return results_df



# test the function and show results


In [20]:
#take user input
query = input("Enter your query: ")
results = search_and_predict(query, tfidf_vectorizer, model, df, top_n=5)
results

Enter your query: AI


Unnamed: 0,document_id,similarity,probability,document
635,635,0.400195,0.119516,introduction ai discusses
402,402,0.400195,0.119516,introduction ai discusses
1029,1029,0.405926,0.117635,introduction ai introduces
828,828,0.405926,0.117635,introduction ai introduces
667,667,0.405926,0.117635,introduction ai introduces
