In [1]:
import nltk
import pandas as pd
import numpy as np
import re
# import matplotlib.pyplot as plt
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import TweetTokenizer
#from wordcloud import WordCloud
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer




In [2]:
# Download every NLTK resource the notebook needs up front. WordNet is fetched
# here as well because preprocess_tweet lemmatizes with WordNetLemmatizer and is
# first called before the later cell that downloads 'wordnet'; without this the
# notebook fails on a fresh environment when run top to bottom.
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('wordnet')
# Load positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')


[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Combine both corpora into one labelled frame: 1 = positive, 0 = negative.
all_tweets = positive_tweets + negative_tweets
all_labels = [1] * len(positive_tweets) + [0] * len(negative_tweets)
data = pd.DataFrame({'tweet': all_tweets, 'label': all_labels})

In [4]:
# Show the first rows and the number of tweets per label.
sample_rows = data.head()
print("\nSample Tweets:\n", sample_rows)
label_counts = data['label'].value_counts()
print("\nLabel Distribution:\n", label_counts)



Sample Tweets:
                                                tweet  label
0  #FollowFriday @France_Inte @PKuchly57 @Milipol...      1
1  @Lamb2ja Hey James! How odd :/ Please call our...      1
2  @DespiteOfficial we had a listen last night :)...      1
3                               @97sides CONGRATS :)      1
4  yeaaaah yippppy!!!  my accnt verified rqst has...      1

Label Distribution:
 label
1    5000
0    5000
Name: count, dtype: int64


In [5]:
from nltk.stem import WordNetLemmatizer

# Shared resources used by preprocess_tweet.
# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))
# Lemmatizer applied to every token that survives filtering.
lemmatizer = WordNetLemmatizer()
# Tweet-aware tokenizer: lowercases, strips @handles and shortens repeated characters.
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

In [6]:
# Function to clean tweets
def preprocess_tweet(tweet):
    """Normalize a tweet (or any short text) into a space-joined string of lemmas.

    Steps: strip URLs and @mentions, tokenize with the shared TweetTokenizer,
    drop stop words and tokens that are not purely alphabetic, then lemmatize
    the remaining tokens.

    Args:
        tweet: Raw text. ``None`` or an empty string is accepted and yields ''.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    # Text read from external stores can be missing (None); treat it like an
    # empty document instead of letting re.sub raise a TypeError.
    if not tweet:
        return ''
    # Remove URLs and mentions
    tweet = re.sub(r'http\S+|www\S+|@\S+', '', tweet)
    tokens = tokenizer.tokenize(tweet)
    clean_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and word.isalpha()
    ]
    return ' '.join(clean_tokens)

In [7]:
# Clean every positive tweet with preprocess_tweet.
positive_cleaned = list(map(preprocess_tweet, positive_tweets))


In [8]:
import nltk
# Fetch the WordNet corpus through NLTK's downloader.
# NOTE(review): the WordNetLemmatizer used by preprocess_tweet presumably needs
# this corpus, and preprocess_tweet is already called in an earlier cell —
# confirm the corpus is available before that cell runs on a fresh environment.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Clean every negative tweet with preprocess_tweet.
negative_cleaned = list(map(preprocess_tweet, negative_tweets))

In [10]:
# Create cleaned dataset: cleaned positive tweets first (label 1), then negative (label 0).
cleaned_tweets = positive_cleaned + negative_cleaned
cleaned_labels = [1] * len(positive_cleaned) + [0] * len(negative_cleaned)
cleaned_data = pd.DataFrame({'tweet': cleaned_tweets, 'label': cleaned_labels})

cleaned_preview = cleaned_data.head()
print("\nCleaned Tweets Sample:\n", cleaned_preview)



Cleaned Tweets Sample:
                                                tweet  label
0                  top engaged member community week      1
1  hey james odd please call contact centre able ...      1
2     listen last night bleed amazing track scotland      1
3                                           congrats      1
4  yeaaah yipppy accnt verified rqst succeed got ...      1


In [11]:
# Feature Extraction: TF-IDF
# The vocabulary is capped at the 5000 highest-ranked terms.
vectorizer = TfidfVectorizer(max_features=5000)
# Keep the TF-IDF matrix sparse. fit_transform returns a SciPy sparse matrix that
# train_test_split and LogisticRegression accept directly; densifying it with
# .toarray() would allocate a 10,000 x up-to-5,000 float64 array (~400 MB) that
# is almost entirely zeros.
X = vectorizer.fit_transform(cleaned_data['tweet'])
y = cleaned_data['label']


In [12]:
# Train-Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Logistic Regression
from sklearn.metrics import  accuracy_score
from sklearn.linear_model import LogisticRegression

# Fit with default hyper-parameters (fit returns the estimator itself).
lr_model = LogisticRegression().fit(X_train, y_train)
# Score the held-out split.
y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("\nLogistic Regression Accuracy:", lr_accuracy)



Logistic Regression Accuracy: 0.745


In [14]:
# Function to predict user input sentiment
def predict_sentiment(input_text, model, vectorizer):
    """Return "Positive" or "Negative" for a piece of free text.

    The text is cleaned with preprocess_tweet, turned into features with the
    given (already fitted) vectorizer, and classified by ``model``. A predicted
    label of 1 maps to "Positive"; anything else maps to "Negative".
    """
    features = vectorizer.transform([preprocess_tweet(input_text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"

In [15]:
import os

from pymongo import MongoClient

# MongoDB Connection
# The connection string embeds a username and password, so it must not live in
# the notebook. It is read from the MONGODB_URI environment variable instead.
# NOTE(review): the credentials previously committed in this cell should be
# considered leaked and rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string before running this cell."
    )
client = MongoClient(mongodb_uri)
db = client["journaldb"]  # Database holding the journal application's data
collection = db["users"]  # Collection of user documents

In [19]:
import os

from pymongo import MongoClient
from bson.objectid import ObjectId

# Connect to MongoDB. The connection string embeds credentials, so it is read
# from the MONGODB_URI environment variable rather than stored in the notebook.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string before running this cell."
    )
client = MongoClient(mongodb_uri)
db = client['journaldb']  # Connect to the journaldb database
users_collection = db['users']  # Access the users collection
journal_entries_collection = db['journal_Entries']  # Access the journal_Entries collection


def analyze_journal_entry(entry_text):
    """Classify one journal entry's text as "Positive" or "Negative".

    Thin wrapper around predict_sentiment bound to the trained ``lr_model`` and
    the fitted TF-IDF ``vectorizer``. Defined at cell level (not inside the
    branch below) so later cells can call it regardless of the lookup outcome.
    """
    return predict_sentiment(entry_text, lr_model, vectorizer)


# Fetch a specific user by ID
user_id = ObjectId("67700c6266604f272107a387")  # Replace with the actual user ObjectId
user_document = users_collection.find_one({"_id": user_id})

# Initialised before the branch so later cells can iterate it even when the
# user is missing or has no entries.
journal_entries = []

# Ensure user exists and has journal entries
if user_document and "journalEntries" in user_document:
    # "journalEntries" is an array of references; each reference's .id is used
    # as the _id of a document in the journal_Entries collection.
    for ref in user_document["journalEntries"]:
        entry_document = journal_entries_collection.find_one({"_id": ref.id})
        if entry_document:
            journal_entries.append(entry_document)

    # Example: Display fetched journal entries
    for entry in journal_entries:
        print(entry)

    # Perform sentiment analysis. The entry body is stored under "content" (see
    # the printed documents); "text" is kept only as a fallback for documents
    # that may use the older key.
    analysis_results = []
    for entry in journal_entries:
        entry_text = entry.get("content", entry.get("text"))
        if entry_text is None:
            # Nothing to analyze for this entry.
            continue
        analysis_results.append({
            "entry_id": entry["_id"],
            "sentiment": analyze_journal_entry(entry_text),
        })

    # Save the analysis results back to the user's document
    users_collection.update_one(
        {"_id": user_id},
        {"$set": {"journal_Analysis": analysis_results}}  # Add a new field for analysis results
    )
    print("Sentiment analysis results saved to MongoDB.")
else:
    print("User not found or no journal entries available.")


{'_id': ObjectId('6770d7c09138563bc210298c'), 'title': 'string', 'date': datetime.datetime(2024, 12, 29, 5, 1, 52, 457000), 'content': 'string', 'sentiment': '', '_class': 'net.khushtaunk.journalApp.Entity.journalEntry'}
{'_id': ObjectId('67715b04124a453fbd2c0573'), 'title': 'lets edit this', 'date': datetime.datetime(2024, 12, 29, 14, 21, 56, 900000), 'content': 'okkk edited', 'sentiment': '', '_class': 'net.khushtaunk.journalApp.Entity.journalEntry'}
{'_id': ObjectId('67715d47124a453fbd2c0574'), 'title': 'hello', 'date': datetime.datetime(2024, 12, 29, 14, 31, 35, 317000), 'content': 'helloooo', 'sentiment': '', '_class': 'net.khushtaunk.journalApp.Entity.journalEntry'}
{'_id': ObjectId('67717576239af27b2cd4f61c'), 'title': 'just test', 'date': datetime.datetime(2024, 12, 29, 16, 14, 46, 450000), 'content': 'doing this for teting', 'sentiment': '', '_class': 'net.khushtaunk.journalApp.Entity.journalEntry'}
{'_id': ObjectId('6776b37a058cba56ef8811e6'), 'title': 'hey this is khush', 'd

In [20]:
# Run sentiment analysis on the 'content' field of every fetched journal entry.
results = []

for entry in journal_entries:
    # Entries without a 'content' field are reported and skipped.
    if "content" not in entry:
        print(f"Entry with ID {entry['_id']} has no 'content' field.")
        continue

    sentiment = analyze_journal_entry(entry["content"])
    record = {
        "entry_id": entry["_id"],     # unique ID of the journal entry
        "content": entry["content"],  # original text, kept for reference
        "sentiment": sentiment,       # predicted sentiment label
    }
    results.append(record)

print("Analysis Results:", results)


Analysis Results: [{'entry_id': ObjectId('6770d7c09138563bc210298c'), 'content': 'string', 'sentiment': 'Negative'}, {'entry_id': ObjectId('67715b04124a453fbd2c0573'), 'content': 'okkk edited', 'sentiment': 'Negative'}, {'entry_id': ObjectId('67715d47124a453fbd2c0574'), 'content': 'helloooo', 'sentiment': 'Negative'}, {'entry_id': ObjectId('67717576239af27b2cd4f61c'), 'content': 'doing this for teting', 'sentiment': 'Negative'}, {'entry_id': ObjectId('6776b37a058cba56ef8811e6'), 'content': 'am just writing this to make me happy', 'sentiment': 'Positive'}]


In [21]:
# Write each predicted sentiment back onto its journal entry document.
for result in results:
    entry_filter = {"_id": result["entry_id"]}
    sentiment_update = {"$set": {"sentiment": result["sentiment"]}}
    db["journal_Entries"].update_one(entry_filter, sentiment_update)
