NEP


In [None]:
#dataset-https://www.kaggle.com/datasets/rishabh6377/india-national-education-policy2020-tweets-dataset
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords

# Read the NEP 2020 tweet CSV from the Kaggle input directory
education = pd.read_csv(
    "../input/india-national-education-policy2020-tweets-dataset/NEP_2020_english_tweet.csv"
)

# Fetch the NLTK stopword corpus used below
nltk.download('stopwords')

# English stopwords minus every entry containing "n't" (e.g. "hasn't"),
# so the list itself no longer contains those contractions
stopword_list = list(filter(lambda entry: "n't" not in entry, stopwords.words('english')))

# Text Preprocessing
def preprocess(text):
    """Normalise one tweet for sentiment scoring.

    Lower-cases the text, removes @mentions, #hashtags and links, replaces
    punctuation with spaces, and drops stopwords, single-character tokens
    and purely numeric tokens.

    The apostrophe is deliberately kept while stripping punctuation: otherwise
    "hasn't" is split into "hasn" and "t", both of which are then discarded,
    even though ``stopword_list`` was built to keep the "n't" words.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN from a missing cell)
            are treated as empty.

    Returns:
        The remaining tokens joined by single spaces ("" for empty or
        non-string input).
    """
    if not isinstance(text, str):
        return ""

    text = text.strip().lower()
    text = re.sub(r'(@\w+|#\w+|https?://\S+|www\.\S+)', '', text)  # Remove mentions, hashtags, links

    # Replace every punctuation mark except the apostrophe with a space
    punctuation = string.punctuation.replace("'", "")
    text = text.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    words = []
    for token in text.split():
        word = token.strip("'")  # drop quoting apostrophes, keep inner ones ("hasn't")
        if len(word) > 1 and not word.isdigit() and word not in stopword_list:
            words.append(word)
    return " ".join(words)

# Store the cleaned version of every tweet in a new 'Processed' column
education['Processed'] = education['Tweet'].map(preprocess)

# Sentiment Functions
def get_subjectivity(text):
    """Return the subjectivity value TextBlob reports for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

def get_polarity(text):
    """Return the polarity value TextBlob reports for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

def get_sentiment(polarity):
    """Turn a polarity score into a label.

    Scores below zero are 'Negative', exactly zero is 'Neutral', and
    everything else is 'Positive'.
    """
    if polarity < 0:
        return 'Negative'
    return 'Neutral' if polarity == 0 else 'Positive'

# Score each cleaned tweet, then label it from its polarity
processed_tweets = education['Processed']
education['subjectivity'] = processed_tweets.apply(get_subjectivity)
education['polarity'] = processed_tweets.apply(get_polarity)
education['Sentiment'] = education['polarity'].apply(get_sentiment)

# Bar chart: number of tweets per sentiment label
sentiment_counts = education['Sentiment'].value_counts()
plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar', color='skyblue')
plt.title("Sentiment Analysis of NEP 2020 Tweets")
plt.xlabel("Sentiments")
plt.ylabel("Frequency")
plt.show()



IPL

In [None]:
#dataset-https://www.kaggle.com/datasets/patrickb1912/ipl-complete-dataset-20082020
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the ball-by-ball deliveries file
deliveries = pd.read_csv('/content/ipl/deliveries.csv')

# Basic cleanup: discard rows with no batter or no bowler recorded
required_columns = ['batter', 'bowler']
deliveries = deliveries.dropna(subset=required_columns)

# --- Plot 1: Total Runs by Each Batting Team ---
# Sum batsman runs per batting team, largest total first
runs_by_team = deliveries.groupby('batting_team')['batsman_runs'].sum()
team_runs = runs_by_team.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
team_runs.plot(kind='bar', color='skyblue')
plt.xticks(rotation=45)
plt.xlabel('Batting Team')
plt.ylabel('Total Batsman Runs')
plt.title('Total Runs by Each Batting Team')
plt.show()

# --- Plot 2: Top Wicket-Taking Bowling Teams ---
# A delivery is a wicket only when `is_wicket` is set; filtering on
# `batsman_runs == 0` would count every scoreless ball instead of dismissals.
# Dismissals the bowler gets no credit for (run outs, retirements,
# obstructing the field) are left out.
# NOTE(review): relies on the `is_wicket` / `dismissal_kind` columns of the
# Kaggle deliveries.csv — confirm they exist in the file being loaded.
non_bowler_dismissals = ['run out', 'retired hurt', 'retired out', 'obstructing the field']
wickets = deliveries[
    (deliveries['is_wicket'] == 1)
    & ~deliveries['dismissal_kind'].isin(non_bowler_dismissals)
]
team_wickets = wickets['bowling_team'].value_counts()

plt.figure(figsize=(10,6))
team_wickets.plot(kind='bar', color='salmon')
plt.title('Wickets Taken by Each Bowling Team')
plt.xlabel('Bowling Team')
plt.ylabel('Wickets')
plt.xticks(rotation=45)
plt.show()

# --- Plot 3: Top 10 Batters by Total Runs ---
# Total batsman runs per batter, keeping the ten largest
runs_by_batter = deliveries.groupby('batter')['batsman_runs'].sum()
top_batters = runs_by_batter.sort_values(ascending=False).head(10)

plt.figure(figsize=(10, 6))
top_batters.plot(kind='barh', color='lightgreen')
plt.gca().invert_yaxis()  # largest total at the top of the chart
plt.title('Top 10 Batters by Runs')
plt.xlabel('Total Runs')
plt.ylabel('Batter')
plt.show()

# --- Plot 4: Top 10 Bowlers by Wickets ---
# Count real dismissals (`is_wicket == 1`) rather than scoreless balls
# (`batsman_runs == 0`), and skip dismissals that are not credited to the
# bowler (run outs, retirements, obstructing the field).
# NOTE(review): relies on the `is_wicket` / `dismissal_kind` columns of the
# Kaggle deliveries.csv — confirm they exist in the file being loaded.
non_bowler_dismissals = ['run out', 'retired hurt', 'retired out', 'obstructing the field']
bowler_wickets = deliveries[
    (deliveries['is_wicket'] == 1)
    & ~deliveries['dismissal_kind'].isin(non_bowler_dismissals)
]
top_bowlers = bowler_wickets['bowler'].value_counts().head(10)

plt.figure(figsize=(10,6))
top_bowlers.plot(kind='barh', color='violet')
plt.title('Top 10 Bowlers by Wickets')
plt.xlabel('Wickets')
plt.ylabel('Bowler')
plt.gca().invert_yaxis()  # highest wicket-taker at the top
plt.show()

# --- Plot 5: Runs per Over (Overall) ---
# Batsman runs summed over all rows sharing the same over number
runs_per_over = deliveries['batsman_runs'].groupby(deliveries['over']).sum()

plt.figure(figsize=(10, 6))
sns.lineplot(
    x=runs_per_over.index,
    y=runs_per_over.values,
    marker='o',
    color='orange',
)
plt.grid(True)
plt.xlabel('Over Number')
plt.ylabel('Runs')
plt.title('Total Runs Scored per Over')
plt.show()


Amazon Product

In [None]:
#dataset-https://www.kaggle.com/datasets/promptcloud/amazon-product-reviews-dataset/data

import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob

# Read the review sample, keep only the text and rating columns,
# and discard rows whose review text is missing
raw_reviews = pd.read_csv('/mnt/data/amazon_com-product_reviews__20200101_20200331_sample.csv')
reviews = raw_reviews[['Review Content', 'Review Rating']].dropna(subset=['Review Content'])

# --- Sentiment Analysis ---
def get_polarity(text):
    """Return the polarity value TextBlob reports for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def get_sentiment(score):
    """Label a polarity score: below 0 'Negative', exactly 0 'Neutral', otherwise 'Positive'."""
    if score < 0:
        label = 'Negative'
    else:
        label = 'Neutral' if score == 0 else 'Positive'
    return label

# Score every review and derive its sentiment label
reviews['Polarity'] = reviews['Review Content'].apply(get_polarity)
reviews['Sentiment'] = reviews['Polarity'].apply(get_sentiment)

# --- Plot 1: Sentiment Distribution ---
sentiment_counts = reviews['Sentiment'].value_counts()
plt.figure(figsize=(8, 5))
sentiment_counts.plot(kind='bar', color='lightgreen')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.title('Sentiment Analysis of Reviews')
plt.show()

# --- Plot 2: Rating Distribution (Pie Chart) ---
rating_counts = reviews['Review Rating'].value_counts()
plt.figure(figsize=(6, 6))
rating_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Paired.colors,
)
plt.ylabel('')  # hide the automatic y-axis label on the pie
plt.title('Review Rating Distribution')
plt.show()


Brand Twitter analysis-US Airlines

In [None]:
#dataset-https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment/data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Read the airline tweets and peek at the first rows
df = pd.read_csv("/kaggle/input/twitter-airline-sentiment/Tweets.csv")
df.head()

# Count of tweets per sentiment label
sns.countplot(data=df, x="airline_sentiment", palette="viridis")
plt.title("Sentiment Distribution")
plt.show()

# English stopwords as a set for fast membership tests in clean_tweet()
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_tweet(tweet):
    """Strip links, @mentions and non-letter characters, lower-case, and drop stopwords.

    Returns the surviving words joined by single spaces.
    """
    letters_only = re.sub(r"http\S+|@\S+|[^a-zA-Z\s]", "", tweet)
    kept = [token for token in letters_only.lower().split() if token not in stop_words]
    return " ".join(kept)

# Cleaned copy of every tweet
df['clean_text'] = df['text'].apply(clean_tweet)

from wordcloud import WordCloud

# One word cloud per sentiment class, built from that class's cleaned tweets
for sentiment in ('positive', 'negative', 'neutral'):
    matching = df['airline_sentiment'] == sentiment
    text = " ".join(df.loc[matching, 'clean_text'])
    wordcloud = WordCloud(background_color='white', width=800, height=400).generate(text)

    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f"WordCloud - {sentiment.capitalize()} Tweets")
    plt.axis('off')
    plt.show()

Negative Tweet reviews-Amazon Product

In [None]:
#dataset-https://www.kaggle.com/datasets/promptcloud/amazon-product-reviews-dataset/data
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob

# Read the Amazon review sample
reviews = pd.read_csv('/mnt/data/amazon_com-product_reviews__20200101_20200331_sample.csv')

# Restrict to the review text and rating, then drop rows with no review text
wanted_columns = ['Review Content', 'Review Rating']
reviews = reviews[wanted_columns]
reviews = reviews.dropna(subset=wanted_columns[:1])

# --- Sentiment Analysis ---
def get_polarity(text):
    """Return the polarity value TextBlob reports for ``text``."""
    analysed = TextBlob(text)
    return analysed.sentiment.polarity

def get_sentiment(score):
    """Label a polarity score as 'Negative' (< 0), 'Neutral' (== 0) or 'Positive' (otherwise)."""
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

# Score and label every review
reviews['Polarity'] = reviews['Review Content'].apply(get_polarity)
reviews['Sentiment'] = reviews['Polarity'].apply(get_sentiment)

# Keep only the rows labelled 'Negative'
is_negative = reviews['Sentiment'] == 'Negative'
negative_reviews = reviews[is_negative]

# Print the first few negative reviews, followed by how many there are
print("Sample Negative Reviews:\n")
print(negative_reviews['Review Content'].head())
print("\nTotal Negative Reviews:", len(negative_reviews))

# --- Plot: Most Common Words in Negative Reviews ---

from collections import Counter
import re

import nltk
from nltk.corpus import stopwords

# Combine all negative review texts into one string
text = " ".join(negative_reviews['Review Content'].tolist())

# Simple cleaning: remove punctuation and numbers
text = re.sub(r'[^A-Za-z\s]', '', text)

# Split into words
words = text.lower().split()

# Remove very common English stopwords. This cell never downloads the corpus,
# so fetch it on demand instead of failing with LookupError on a fresh runtime.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
words = [word for word in words if word not in stop_words and len(word) > 2]

# Count word frequency
word_counts = Counter(words)

# Plot top 15 words
top_words = dict(word_counts.most_common(15))

plt.figure(figsize=(10,6))
plt.bar(top_words.keys(), top_words.values(), color='tomato')
plt.xticks(rotation=45)
plt.title('Top Words in Negative Reviews')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()


Popularity of local businesses

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# Read the company listing and drop the 'Unnamed: 0' column
data = pd.read_csv('/content/Ambition Box.csv').drop(columns=['Unnamed: 0'])

# Review count: keep only digits and dots, then parse as float
data['review'] = data['review'].map(lambda raw: float(re.sub(r'[^\d\.]', '', raw)))

# Company type: trim whitespace, then capitalise
trimmed_type = data['company_type'].str.strip()
data['company_type'] = trimmed_type.str.capitalize()

# Company age: first run of digits as int; missing values become None
data['Company_Age'] = data['Company_Age'].map(
    lambda raw: int(re.search(r'\d+', raw).group()) if pd.notnull(raw) else None
)

# Employee count: trim whitespace only
data['No_of_Employee'] = data['No_of_Employee'].str.strip()

# Ten rows with the highest rating
ranked_by_rating = data.sort_values(by='rating', ascending=False)
top_companies = ranked_by_rating.head(10)

plt.figure(figsize=(10, 6))
plt.barh(top_companies['name'], top_companies['rating'], color='cornflowerblue')
plt.gca().invert_yaxis()  # first (highest-rated) row at the top
plt.title('Top 10 Companies by Rating')
plt.xlabel('Rating')
plt.show()

# Scatter of rating against the parsed review count
plt.figure(figsize=(8, 6))
plt.scatter(data['review'], data['rating'], color='seagreen')
plt.grid(True)
plt.title('Rating vs Number of Reviews')
plt.xlabel('Number of Reviews (in Thousands)')
plt.ylabel('Rating')
plt.show()


Hotel Tweet Analysis

In [None]:
# Dataset: https://www.kaggle.com/code/jonathanoheix/sentiment-analysis-with-hotel-reviews/input

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# Lexicon required by SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Load the dataset
df = pd.read_csv('/content/SMA/unzipped_folder/Hotel_Reviews.csv')

# Work on the first 100 reviews only. Take an explicit copy: new columns are
# assigned to this frame later, and assigning into a slice of the original
# frame triggers pandas' chained-assignment warning.
df = df.head(100).copy()

sid = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Return the compound score from ``sid`` for ``text``; 0 when it is null or blank."""
    has_content = not pd.isnull(text) and text.strip() != ''
    return sid.polarity_scores(text)['compound'] if has_content else 0

# Score the positive and negative review columns separately
for source_column, score_column in (
    ('Positive_Review', 'Positive_Sentiment'),
    ('Negative_Review', 'Negative_Sentiment'),
):
    df[score_column] = df[source_column].apply(get_sentiment_score)

# One long string per review column (missing entries skipped)
positive_text = " ".join(df['Positive_Review'].dropna())
negative_text = " ".join(df['Negative_Review'].dropna())

# Generate WordCloud for Positive Reviews
wordcloud_pos = WordCloud(
    width=800, height=400, background_color='white', colormap='Greens'
).generate(positive_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_pos, interpolation='bilinear')
plt.title('WordCloud for Positive Reviews')
plt.axis('off')
plt.show()

# Generate WordCloud for Negative Reviews
wordcloud_neg = WordCloud(
    width=800, height=400, background_color='white', colormap='Reds'
).generate(negative_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_neg, interpolation='bilinear')
plt.title('WordCloud for Negative Reviews')
plt.axis('off')
plt.show()

sns.set(style="whitegrid")

# Histogram (with KDE) of the scores from the positive-review column
plt.figure(figsize=(8, 5))
sns.histplot(df['Positive_Sentiment'], bins=20, kde=True, color='green')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Positive Sentiment Scores')
plt.show()

# Histogram (with KDE) of the scores from the negative-review column
plt.figure(figsize=(8, 5))
sns.histplot(df['Negative_Sentiment'], bins=20, kde=True, color='red')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Negative Sentiment Scores')
plt.show()


Sentiment analysis of product

In [None]:
#dataset-https://www.kaggle.com/code/benroshan/sentiment-analysis-amazon-reviews/input?select=Musical_instruments_reviews.csv
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# Lexicon required by the VADER analyzer
nltk.download('vader_lexicon')

# Read the musical-instrument reviews
df = pd.read_csv('/content/Musical_instruments_reviews.csv')

# Shared analyzer instance used by get_sentiment()
sid = SentimentIntensityAnalyzer()

# Function to get sentiment score
def get_sentiment(text):
    """Return the compound score from ``sid`` for ``text``, or 0 for null/blank input."""
    if pd.isnull(text):
        return 0
    if text.strip() == '':
        return 0
    scores = sid.polarity_scores(text)
    return scores['compound']

# Compound sentiment score for every reviewText entry
df['sentiment_score'] = df['reviewText'].map(get_sentiment)

# Classify sentiment
def classify_sentiment(score):
    """Bucket a compound score: >= 0.05 'Positive', <= -0.05 'Negative', else 'Neutral'."""
    if score <= -0.05:
        return 'Negative'
    if score >= 0.05:
        return 'Positive'
    return 'Neutral'

df['sentiment_label'] = df['sentiment_score'].apply(classify_sentiment)

# Show the first rows with reviewer, text, score and label
preview_columns = ['reviewerName', 'reviewText', 'sentiment_score', 'sentiment_label']
print(df[preview_columns].head())

# --- Visualization 1: Sentiment Distribution ---
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='sentiment_label', palette='Set2')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.title('Sentiment Distribution of Musical Instrument Reviews')
plt.show()

# --- Visualization 2: WordCloud for Positive Reviews ---
# Concatenate the text of every review labelled 'Positive'
is_positive = df['sentiment_label'] == 'Positive'
positive_reviews = " ".join(df.loc[is_positive, 'reviewText'].dropna())

wordcloud_pos = WordCloud(
    width=800, height=400, background_color='white', colormap='Greens'
).generate(positive_reviews)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_pos, interpolation='bilinear')
plt.title('WordCloud for Positive Reviews')
plt.axis('off')
plt.show()


Social Network data analysis   

In [None]:
#dataset-https://snap.stanford.edu/data/email-Eu-core.html
import networkx as nx
import gzip
import matplotlib.pyplot as plt

# Load dataset
def load_email_network(file_path):
    """Build an undirected graph from a gzip-compressed edge list.

    Each data line holds two whitespace-separated integer node ids. Blank
    lines and lines starting with '#' are skipped, so a trailing newline or a
    comment header does not abort the load.

    Args:
        file_path: Path to the ``.gz`` edge-list file.

    Returns:
        A ``networkx.Graph`` with one edge per data line.

    Raises:
        ValueError: If a data line does not contain exactly two integers.
    """
    G = nx.Graph()
    with gzip.open(file_path, 'rt') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            u, v = map(int, line.split())
            G.add_edge(u, v)
    return G

# Compute centrality measures
def compute_centrality_measures(G):
    """Compute four networkx centrality scores for ``G``.

    Returns a dict keyed by "degree", "betweenness", "closeness" and
    "eigenvector", each holding the result of the matching networkx call
    (eigenvector centrality is allowed up to 1000 iterations).
    """
    measures = {}
    measures["degree"] = nx.degree_centrality(G)
    measures["betweenness"] = nx.betweenness_centrality(G)
    measures["closeness"] = nx.closeness_centrality(G)
    measures["eigenvector"] = nx.eigenvector_centrality(G, max_iter=1000)
    return measures

# Perform community detection
def detect_communities(G):
    """Return the greedy-modularity communities of ``G`` as a list."""
    from networkx.algorithms import community as nx_community

    return list(nx_community.greedy_modularity_communities(G))

# Visualize network
def visualize_network(G, title="Email Network"):
    """Draw ``G`` with a spring layout (fixed seed 42) under the given title and show it."""
    plt.figure(figsize=(10, 8))
    layout = nx.spring_layout(G, seed=42)
    nx.draw(
        G,
        layout,
        node_size=50,
        edge_color="gray",
        with_labels=False,
    )
    plt.title(title)
    plt.show()

# Main Execution: load the graph, analyse it, draw it
file_path = "/content/email-Eu-core.txt.gz"
G = load_email_network(file_path)
centrality_measures = compute_centrality_measures(G)
communities = detect_communities(G)
visualize_network(G)

# Five highest-scoring nodes for each centrality measure
top_degree = sorted(centrality_measures["degree"].items(), key=lambda x: x[1], reverse=True)[:5]
top_betweenness = sorted(centrality_measures["betweenness"].items(), key=lambda x: x[1], reverse=True)[:5]
top_closeness = sorted(centrality_measures["closeness"].items(), key=lambda x: x[1], reverse=True)[:5]
top_eigenvector = sorted(centrality_measures["eigenvector"].items(), key=lambda x: x[1], reverse=True)[:5]

# Print results as per the PDF output format: a heading, then one line per node
for heading, top_nodes in (
    ("Degree Centrality:", top_degree),
    ("\nBetweenness Centrality:", top_betweenness),
    ("\nCloseness Centrality:", top_closeness),
    ("\nEigenvector Centrality:", top_eigenvector),
):
    print(heading)
    for node, centrality in top_nodes:
        print(f"Node {node}: {centrality:.4f}")

# First five communities, showing at most ten sorted members of each
print("\nDetected Communities (First 5 groups shown):")
first_communities = communities[:5]
for i, community in enumerate(first_communities):
    print(f"Community {i + 1}: {sorted(community)[:10]} ...")


Predictive model for airline tweet

In [None]:
#dataset-https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment/data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv('/content/Tweets.csv')

# Look at important columns
print(df.columns)

# Use only 'text' and 'airline_sentiment', dropping rows with missing values.
# dropna() returns a new frame, so nothing is mutated in place on a column slice.
df = df[['text', 'airline_sentiment']].dropna()

# Split into input and output
X = df['text']
y = df['airline_sentiment']

# Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from the training tweets only (no test-set leakage).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert text into numbers (TF-IDF): fit on train, reuse the fit on test
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full matrix kept under its original name, encoded with the train-fitted vectorizer
X_vect = vectorizer.transform(X)

# Train a Logistic Regression model; the higher iteration cap gives the
# solver room to converge on the sparse TF-IDF features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Show classification report
print(classification_report(y_test, y_pred))

# Plot sentiment distribution, picking each bar's colour from its label
# rather than from its position in the frequency ordering.
sentiment_colors = {'negative': 'red', 'neutral': 'blue', 'positive': 'green'}
sentiment_counts = df['airline_sentiment'].value_counts()
sentiment_counts.plot(
    kind='bar',
    color=[sentiment_colors.get(label, 'gray') for label in sentiment_counts.index],
)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()


Coaching class reviews


In [None]:
#dataset-https://www.kaggle.com/datasets/septa97/100k-courseras-course-reviews-dataset
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download required NLTK data
nltk.download('vader_lexicon')

# Load dataset
df = pd.read_csv('/content/coaching/reviews.csv')  # replace with your file name

# Basic info
print(df.head())

# Work on the first 100 reviews only. Take an explicit copy: new columns are
# assigned to this frame later, and assigning into a slice of the original
# frame triggers pandas' chained-assignment warning.
df = df.head(100).copy()

# --- Sentiment Analysis ---
sid = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Return the compound score from ``sid`` for ``text``; null or blank text scores 0."""
    is_blank = pd.isnull(text) or text.strip() == ''
    if is_blank:
        return 0
    compound = sid.polarity_scores(text)['compound']
    return compound

# Compound sentiment score for every entry in the Review column
df['Sentiment_Score'] = df['Review'].map(get_sentiment_score)

# Classify into Positive, Neutral, Negative based on score
def classify(score):
    """Bucket a score: above 0.2 'Positive', below -0.2 'Negative', otherwise 'Neutral'."""
    if score < -0.2:
        return 'Negative'
    return 'Positive' if score > 0.2 else 'Neutral'

df['Sentiment'] = df['Sentiment_Score'].apply(classify)

# --- Visualization ---

# 1. Sentiment distribution
# value_counts() orders bars by frequency, so a positional colour list can put
# green/blue/red on the wrong labels. Fix the bar order and take each colour
# from the label; labels with no reviews are shown as 0.
sentiment_colors = {'Positive': 'green', 'Neutral': 'blue', 'Negative': 'red'}
sentiment_counts = df['Sentiment'].value_counts().reindex(list(sentiment_colors), fill_value=0)
sentiment_counts.plot(kind='bar', color=list(sentiment_colors.values()))
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.show()

# 2. WordCloud for all reviews (missing reviews skipped)
text = " ".join(review for review in df['Review'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('WordCloud of Course Reviews')
plt.show()

# 3. Distribution of Labels, bars ordered by label value
df['Label'].value_counts().sort_index().plot(kind='bar', color='purple')
plt.title('Rating Label Distribution')
plt.xlabel('Label (Rating)')
plt.ylabel('Count')
plt.show()
