NEP


In [None]:
#dataset-https://www.kaggle.com/datasets/rishabh6377/india-national-education-policy2020-tweets-dataset
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords

# Read the NEP 2020 tweet dataset into a DataFrame
education = pd.read_csv(
    "../input/india-national-education-policy2020-tweets-dataset/NEP_2020_english_tweet.csv"
)

# Make sure the NLTK stopword corpus is available before it is read below
nltk.download('stopwords')

# English stopwords, minus every entry containing "n't" (e.g. "hasn't"),
# so those negated forms are not treated as stopwords
stopword_list = list(
    filter(lambda word: "n't" not in word, stopwords.words('english'))
)

# Text Preprocessing
def preprocess(text):
    """Normalise one tweet into a space-separated string of content words.

    Steps: trim and lowercase; remove @mentions, #hashtags and links; turn
    punctuation into spaces; then drop stopwords (``stopword_list``),
    single-character tokens and purely numeric tokens.

    Apostrophes are deliberately not treated as punctuation. The stopword
    list has its "n't" entries removed so that negations such as "hasn't"
    are kept, but splitting on the apostrophe turned "hasn't" into
    "hasn" + "t" and both fragments were then discarded (the one-letter
    "t" by the length filter, "hasn" because NLTK's English list also
    carries the bare fragment). Keeping the apostrophe lets the whole
    contraction survive.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN from a missing
            CSV cell, or None) are accepted and yield an empty string.

    Returns:
        The cleaned tweet as a single string; "" when nothing remains.
    """
    # Missing cells arrive as float NaN / None; there is nothing to clean.
    if not isinstance(text, str):
        return ""

    text = text.strip().lower()
    text = re.sub(r'(@\w+|#\w+|https?://\S+|www\.\S+)', '', text)  # Remove mentions, hashtags, links

    # Replace every punctuation character except the apostrophe with a space.
    punctuation = string.punctuation.replace("'", "")
    text = text.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    # Strip quote-style apostrophes from token edges ('great' -> great) while
    # leaving the inner apostrophe of contractions untouched.
    tokens = (token.strip("'") for token in text.split())
    words = [
        word for word in tokens
        if word not in stopword_list and len(word) > 1 and not word.isdigit()
    ]
    return " ".join(words)

# Clean every raw tweet and keep the result alongside the original text
education['Processed'] = education['Tweet'].map(preprocess)

# Sentiment Functions
def get_subjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def get_polarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def get_sentiment(polarity):
    """Map a polarity score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero,
    and 'Positive' for everything else.
    """
    if polarity < 0:
        return 'Negative'
    if polarity == 0:
        return 'Neutral'
    return 'Positive'

# Score each cleaned tweet, then bucket the polarity into a label
education['subjectivity'] = education['Processed'].map(get_subjectivity)
education['polarity'] = education['Processed'].map(get_polarity)
education['Sentiment'] = education['polarity'].map(get_sentiment)

# Bar chart: number of tweets per sentiment label
plt.figure(figsize=(8, 6))
education['Sentiment'].value_counts().plot.bar(color='skyblue')
plt.title("Sentiment Analysis of NEP 2020 Tweets")
plt.xlabel("Sentiments")
plt.ylabel("Frequency")
plt.show()



IPL

In [None]:
#dataset-https://www.kaggle.com/datasets/patrickb1912/ipl-complete-dataset-20082020
import pandas as pd
import matplotlib.pyplot as plt

# Read the deliveries CSV and discard rows whose batter or bowler is missing
deliveries = pd.read_csv('/mnt/data/deliveries.csv').dropna(
    subset=['batter', 'bowler']
)

# --- Plot 1: Total Runs scored by each team ---
# Sum total_runs per batting team, largest first
team_runs = (
    deliveries
    .groupby('batting_team')['total_runs']
    .sum()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
team_runs.plot.bar(color='skyblue')
plt.xticks(rotation=45)
plt.xlabel('Batting Team')
plt.ylabel('Total Runs')
plt.title('Total Runs by Each Batting Team')
plt.show()

# --- Plot 2: Total Wickets taken by each bowling team ---
# Keep only deliveries flagged as a wicket, then count them per bowling team
wickets = deliveries.loc[deliveries['is_wicket'] == 1]
team_wickets = wickets['bowling_team'].value_counts()

plt.figure(figsize=(10, 6))
team_wickets.plot.bar(color='salmon')
plt.xticks(rotation=45)
plt.xlabel('Bowling Team')
plt.ylabel('Wickets Taken')
plt.title('Total Wickets Taken by Each Bowling Team')
plt.show()


Amazon Product

In [None]:
#dataset-https://www.kaggle.com/datasets/promptcloud/amazon-product-reviews-dataset/data

import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob

# Read the Amazon review sample
reviews = pd.read_csv('/mnt/data/amazon_com-product_reviews__20200101_20200331_sample.csv')

# Keep the review text and its rating, dropping rows with no review text
reviews = reviews[['Review Content', 'Review Rating']].dropna(
    subset=['Review Content']
)

# --- Sentiment Analysis ---
def get_polarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

def get_sentiment(score):
    """Label a polarity score.

    'Negative' when the score is below zero, 'Neutral' when it equals zero,
    otherwise 'Positive'.
    """
    return 'Negative' if score < 0 else ('Neutral' if score == 0 else 'Positive')

# Score each review, then turn the score into a sentiment label
reviews['Polarity'] = reviews['Review Content'].map(get_polarity)
reviews['Sentiment'] = reviews['Polarity'].map(get_sentiment)

# --- Plot 1: Sentiment Distribution ---
plt.figure(figsize=(8, 5))
reviews['Sentiment'].value_counts().plot.bar(color='lightgreen')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.title('Sentiment Analysis of Reviews')
plt.show()

# --- Plot 2: Rating Distribution (Pie Chart) ---
plt.figure(figsize=(6, 6))
reviews['Review Rating'].value_counts().plot.pie(
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Paired.colors,
)
plt.ylabel('')  # blank out the y-axis label on the pie chart
plt.title('Review Rating Distribution')
plt.show()


Brand Twitter analysis-US Airlines

In [None]:
#dataset-https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment/data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Read the airline tweets
df = pd.read_csv("/kaggle/input/twitter-airline-sentiment/Tweets.csv")
# Preview of the first rows (a bare expression; its value is not printed here)
df.head()

# Count of tweets per labelled sentiment
sns.countplot(data=df, x="airline_sentiment", palette="viridis")
plt.title("Sentiment Distribution")
plt.show()

# Fetch the NLTK stopword corpus and hold it as a set for membership tests
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def clean_tweet(tweet):
    """Return ``tweet`` reduced to lowercase non-stopword words.

    Links (``http...``), @mentions and every character that is neither an
    ASCII letter nor whitespace are removed first; the remainder is
    lowercased, split on whitespace, filtered against ``stop_words`` and
    re-joined with single spaces.
    """
    letters_only = re.sub(r"http\S+|@\S+|[^a-zA-Z\s]", "", tweet)
    kept = [
        token
        for token in letters_only.lower().split()
        if token not in stop_words
    ]
    return " ".join(kept)

# Cleaned version of every tweet, used as input for the word clouds
df['clean_text'] = df['text'].map(clean_tweet)

from wordcloud import WordCloud

# One word cloud per sentiment class, built from that class's cleaned tweets
for sentiment in ('positive', 'negative', 'neutral'):
    text = " ".join(df.loc[df['airline_sentiment'] == sentiment, 'clean_text'])
    wordcloud = WordCloud(background_color='white', width=800, height=400).generate(text)
    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f"WordCloud - {sentiment.capitalize()} Tweets")
    plt.axis('off')
    plt.show()

Negative Tweet reviews-Amazon Product

In [None]:
#dataset-https://www.kaggle.com/datasets/promptcloud/amazon-product-reviews-dataset/data
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob

# Read the Amazon review sample
reviews = pd.read_csv('/mnt/data/amazon_com-product_reviews__20200101_20200331_sample.csv')

# Restrict to review text + rating and drop rows that have no review text
reviews = reviews[['Review Content', 'Review Rating']].dropna(
    subset=['Review Content']
)

# --- Sentiment Analysis ---
def get_polarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    analysis = TextBlob(text).sentiment
    return analysis.polarity

def get_sentiment(score):
    """Convert a polarity score into 'Negative', 'Neutral' or 'Positive'.

    Scores below zero are 'Negative', a score equal to zero is 'Neutral',
    and any other score is 'Positive'.
    """
    if score < 0:
        label = 'Negative'
    elif score == 0:
        label = 'Neutral'
    else:
        label = 'Positive'
    return label

# Score each review and label it
reviews['Polarity'] = reviews['Review Content'].map(get_polarity)
reviews['Sentiment'] = reviews['Polarity'].map(get_sentiment)

# Rows whose label is 'Negative'
negative_reviews = reviews.loc[reviews['Sentiment'] == 'Negative']

# Print the first few negative review texts
print("Sample Negative Reviews:\n")
print(negative_reviews['Review Content'].head())

# Print the number of negative reviews (row count of the filtered frame)
print("\nTotal Negative Reviews:", len(negative_reviews))

# --- Plot: Most Common Words in Negative Reviews ---

from collections import Counter
import re

import nltk
from nltk.corpus import stopwords

# The stopword corpus must be present before stopwords.words() is called;
# without this a fresh environment raises LookupError.
nltk.download('stopwords')

# Combine all negative review texts into one string
# (astype(str) guards against non-text cells breaking the join)
text = " ".join(negative_reviews['Review Content'].astype(str).tolist())

# Replace punctuation and digits with a SPACE rather than deleting them:
# deleting fuses neighbouring words ("great.But" -> "greatBut") and turns
# contractions into non-stopwords ("don't" -> "dont"). With a space, "don't"
# becomes "don" + "t", which the stopword and length filters below remove.
text = re.sub(r'[^A-Za-z\s]', ' ', text)

# Split into lowercase words
words = text.lower().split()

# Drop English stopwords and words of one or two letters
stop_words = set(stopwords.words('english'))
words = [word for word in words if word not in stop_words and len(word) > 2]

# Count word frequency
word_counts = Counter(words)

# Keep the 15 most frequent words (fewer if the vocabulary is smaller)
top_words = dict(word_counts.most_common(15))

# Bar chart of the top words; keys/values are materialised as lists for plt.bar
plt.figure(figsize=(10,6))
plt.bar(list(top_words.keys()), list(top_words.values()), color='tomato')
plt.xticks(rotation=45)
plt.title('Top Words in Negative Reviews')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()


Popularity of local businesses

In [None]:
#Dataset-https://www.kaggle.com/datasets/yogeshrampariya/indian-company-dataset
import pandas as pd
from io import StringIO

# Load the CSV data
df = pd.read_csv("/path")


def _parse_review_count(raw):
    """Convert one raw ``review`` cell into a numeric review count.

    Parentheses, commas, whitespace and the word "Review(s)" are stripped;
    a trailing "k" multiplies the number by 1000.
    # NOTE(review): assumes cells look like "(73.1k Reviews)" / "(850 Reviews)"
    # — confirm against the actual CSV.

    Args:
        raw: Cell value from the ``review`` column (string, number or NaN).

    Returns:
        The review count as a float; 0.0 for missing or blank cells.

    Raises:
        ValueError: If what remains after cleaning is not a number.
    """
    # Missing cells (NaN / None) count as zero reviews.
    if pd.isna(raw):
        return 0.0

    text = str(raw).lower()
    for junk in ("reviews", "review", "(", ")", ","):
        text = text.replace(junk, "")
    text = "".join(text.split())  # remove any remaining whitespace

    if not text:
        return 0.0
    # The "k" suffix must still be present here so it can be detected;
    # stripping it up front would lose the x1000 scaling.
    if text.endswith("k"):
        return float(text[:-1]) * 1000
    return float(text)


# Clean review data and convert to numeric
df['review_count'] = df['review'].apply(_parse_review_count)

# Ten businesses with the highest review counts
top_by_reviews = (
    df[['name', 'review_count', 'rating']]
    .sort_values('review_count', ascending=False)
    .head(10)
)

# Ten highest-rated businesses; equal ratings are ordered by review count
top_by_rating = (
    df[['name', 'rating', 'review_count']]
    .sort_values(['rating', 'review_count'], ascending=[False, False])
    .head(10)
)

# Print both rankings as plain tables without the index column
print("Top 10 Businesses by Popularity (Reviews):")
print(top_by_reviews.to_string(index=False))
print("\nTop 10 Businesses by Customer Satisfaction (Rating):")
print(top_by_rating.to_string(index=False))