In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
## load the training spreadsheet into a DataFrame
train_path = '/kaggle/input/adobebehaviorsimulationtrain/behaviour_simulation_train.xlsx'
df = pd.read_excel(train_path)

In [None]:
## preview the first rows of the data
df.head()

In [None]:
## summary statistics restricted to the object-dtype columns
df.describe(include = [np.dtype(object)])

In [None]:
## summary statistics using describe()'s default column selection (the numerical data)
df.describe()


In [None]:
# Constant column of ones; summing it over a group gives the number of rows in that group.
df['count'] = np.ones(df.shape[0])

In [None]:
##barplot(frequency in x direction: taken from kaggle)
def drawbarplot(x,y,xlabel,title,figsize=(10,10)):
    """Draw a horizontal bar chart and write each bar's value on its row.

    Parameters
    ----------
    x : sequence of numbers
        Bar lengths; every value is also printed as a text label.
    y : sequence
        Category of each bar; also passed as the display order.
    xlabel : str
        Label for the x axis.
    title : str
        Figure title.
    figsize : tuple, optional
        Figure size in inches, default (10, 10).
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=x, y=y, palette='terrain', orient='h', order=y, ax=ax)

    # Each label is anchored at x = 0.8 in data coordinates, on the row of its bar.
    for row, value in enumerate(x):
        ax.text(0.8, row, value, color='k', fontsize=10)

    ax.set_title(title, fontsize=20)
    ax.set_xlabel(xlabel, fontsize=14)
    plt.show()

In [None]:
## group the rows by company; `k` is reused by the per-company cells below
k = df.groupby(['inferred company'])

In [None]:
## number of distinct usernames associated with each company
# One vectorised pass instead of re-running the group-wise unique() on every
# loop iteration and indexing its label-indexed result by position.
# dropna=False counts a missing username as one distinct value, exactly as the
# unique()-based count did.
l = k['username'].nunique(dropna=False).tolist()




In [None]:
# distribution summary of the per-company distinct-username counts held in `l`
usernames = pd.Series(l)
usernames.describe()


In [None]:
# Histogram of distinct usernames per company over the range 0-150
# (author's observation: most companies have fewer than 20 usernames).
fig, ax = plt.subplots()
ax.hist(usernames, range=(0, 150))
plt.show()

In [None]:
## pair every company name with its distinct-username count
# The names are read from the group keys in one pass. The earlier loop assumed
# exactly 220 companies and recomputed the group-wise unique() on each
# iteration; this version adapts to however many groups exist.
namelist = k.size().index.tolist()

companyuser = pd.DataFrame(data={'company': namelist, 'users': l})

# keep only the 50 companies with the most distinct usernames
companyuser = companyuser.sort_values(by=['users'], ascending=False).head(50)


In [None]:
## bar chart of the 50 companies with the most distinct usernames
drawbarplot(x = companyuser['users'],y = companyuser['company'],xlabel ='users',title = 'company vs users')

In [None]:
## box plot of the number of likes
# Pass the series by keyword: a positional argument is interpreted as `x` by
# older seaborn releases and as `data` by newer ones, which changes the
# orientation of the plot.
sns.boxplot(x=df['likes'])

In [None]:
## plotting the PDF and CDF of likes
# Sort a copy. Calling .sort() on df['likes'].values sorts the array backing
# the DataFrame column in place, which can silently reorder the likes relative
# to every other column of `df` (or fail when that array is read-only).
j = np.sort(df['likes'].to_numpy())

count, bins_count = np.histogram(j, bins=10)

# PDF: fraction of all values that fall into each histogram bin
pdf = count / count.sum()

# CDF: running total of the PDF
cdf = np.cumsum(pdf)

# plot both against the right-hand edge of each bin
plt.plot(bins_count[1:], pdf, color="red", label="PDF")
plt.plot(bins_count[1:], cdf, label="CDF")
plt.legend()

## almost all the values are at the start

In [None]:
# sorted like counts plotted against their rank (1 = smallest value)
# The x range follows len(j) instead of assuming exactly 300000 rows.
plt.plot(range(1, len(j) + 1), j)
plt.show()


In [None]:
# summary statistics of the likes column
df['likes'].describe()

In [None]:
## count the outliers in likes using the IQR (fence) rule
# The quartiles are computed from the data instead of being copied by hand from
# the describe() output, so the fences follow the data actually loaded.
q1, q3 = df['likes'].quantile([0.25, 0.75])
IQR = q3 - q1
upperfence = q3 + 1.5*IQR
lowerfence = max(0, q1 - 1.5*IQR)  # the lower fence is clamped at zero
# vectorised comparison instead of a Python-level loop over every row
k1 = (df['likes'] > upperfence) | (df['likes'] < lowerfence)
print("the number of outliers by using interquartile method :", int(k1.sum()))



In [None]:
## total likes per company
# Aggregate once. The earlier loop re-summed every group on each of a
# hard-coded 220 iterations and indexed the label-indexed result by position.
likes_per_company = k['likes'].sum()
likelist = likes_per_company.tolist()

likecompany = pd.DataFrame(data={'company': likes_per_company.index.tolist(), 'likelist': likelist})

# keep only the 50 companies with the most total likes
likecompany = likecompany.sort_values(by=['likelist'], ascending=False).head(50)
likecompany

In [None]:
## top 50 companies by total number of likes
# x-axis label corrected: the bars show like totals, not user counts
drawbarplot(x = likecompany['likelist'],y = likecompany['company'],xlabel ='likes',title = 'company vs like')

In [None]:
## number of tweets per company, counted as non-null `likes` entries in each group
df.groupby(['inferred company'])['likes'].count()

In [None]:
## top 50 companies by number of tweets
# Aggregate once. The earlier loop re-counted every group on each of a
# hard-coded 220 iterations and indexed the label-indexed result by position.
tweets_per_company = k['likes'].count()
countlist = tweets_per_company.tolist()

countcompany = pd.DataFrame(data={'company': tweets_per_company.index.tolist(), 'countlist': countlist})

# keep only the 50 companies with the most tweets
countcompany = countcompany.sort_values(by=['countlist'], ascending=False).head(50)
countcompany

In [None]:
# bar chart of the 50 companies with the most tweets
drawbarplot(x = countcompany['countlist'],y = countcompany['company'],xlabel ='tweet per company',title = 'company vs count of tweets')

In [None]:
##plots the distribution of all the likes over time by username/company
def _plot_monthly_likes(nd):
    """Plot the month-by-month sum of `likes` for the rows in `nd`."""
    # assign() builds a new frame, so nothing is written into a slice of the
    # caller's DataFrame and the datetime conversion always takes effect.
    nd = nd.assign(date=pd.to_datetime(nd['date'])).set_index('date')
    # A MonthEnd offset selects the same calendar-month bins as the "M" alias,
    # which pandas has deprecated.
    ndp = nd.groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))['likes'].sum()

    plt.figure(figsize = (10,6))
    # plot() with circle markers draws what the deprecated plot_date() drew.
    plt.plot(ndp.index, ndp, marker='o', linestyle='solid')
    plt.show()


def plotwithtime_user(username,df):
    """Plot the monthly total of likes for one username.

    Parameters
    ----------
    username : str
        Value matched against the 'username' column.
    df : pandas.DataFrame
        Must contain 'username', 'date' and 'likes' columns; it is not modified.
    """
    _plot_monthly_likes(df.loc[df['username']==username,:])


def plotwithtime_company(username,df):
    """Plot the monthly total of likes for one company.

    Parameters
    ----------
    username : str
        Company name, matched against the 'inferred company' column.
    df : pandas.DataFrame
        Must contain 'inferred company', 'date' and 'likes' columns; it is not
        modified.
    """
    _plot_monthly_likes(df.loc[df['inferred company']==username,:])
    

In [None]:
## display the DataFrame to look up usernames for the plots below
df

In [None]:
## monthly likes for a single username, using plotwithtime_user
plotwithtime_user('CBCOlympics',df)

In [None]:
## monthly likes for a single company, using plotwithtime_company
plotwithtime_company('cbc',df)

In [None]:
##sentiment analysis using bert
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re

In [None]:
# Load the tokenizer and the classification model from the same pretrained checkpoint.
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
## score a random subsample of tweets; computing all of them was taking too much time
n_samples = 2000
max_chars = 100  # only the first 100 characters of each tweet are scored

# A seeded generator makes the subsample, and every number derived from it,
# reproducible. Positions are drawn with replacement, as with randint.
rng = np.random.default_rng(0)
sample_positions = rng.integers(0, len(df), size=n_samples)

textclasses = []  # predicted class per tweet: argmax index + 1, so labels start at 1
                  # (presumably a 1-5 star rating -- confirm against the model card)
textlikes = []    # likes of the corresponding tweet

# Inference only: no_grad() stops autograd from recording every forward pass.
with torch.no_grad():
    for ind in sample_positions:
        # .iloc because `ind` is a row position, not an index label
        text = df['content'].iloc[ind]
        textlikes.append(df['likes'].iloc[ind])
        tokens = tokenizer.encode(text[:max_chars], return_tensors='pt')
        result = model(tokens)
        textclasses.append(int(torch.argmax(result.logits)) + 1)

counter = len(textclasses)  # number of tweets scored; kept so the name stays defined

In [None]:
## average likes per predicted sentiment class in the sampled tweets
likevssentiment = pd.DataFrame({'like':textlikes,'sentiment':textclasses})
likevssentiment.groupby(['sentiment']).mean()

In [None]:
# number of sampled tweets in each sentiment class

sns.countplot(x = "sentiment",data=likevssentiment)

In [None]:
# the same class frequencies as a table
likevssentiment.groupby(['sentiment']).count() 

In [None]:
likevssentiment.corr() # author's reading: sentiment and number of likes are only weakly correlated

In [None]:
## using length of tweet as a data feature

corrdf = pd.DataFrame()
corrdf['likes'] = df['likes']
# Vectorised character count. Unlike a Python loop over len(), a missing
# (NaN) tweet gives NaN here instead of raising TypeError.
tl = df['content'].str.len().tolist()
corrdf['tweetlength'] = tl
df['tweetlength'] = tl  # later cells read this column from `df`
# The x range follows len(df) instead of assuming exactly 300000 rows.
plt.scatter(range(1, len(df) + 1), df.tweetlength)

In [None]:
# Per-company averages, to check whether tweet length and likes are related company-wise.
corrdf['company'] = df['inferred company']
by_company = corrdf.groupby(['company'])
companylm = by_company['tweetlength'].mean()
companylim = by_company['likes'].mean()

## relation between mean likes and mean tweet length of the different companies
corr2 = pd.DataFrame({'twtmean': companylm, 'likemean': companylim})
plt.scatter(corr2['twtmean'], corr2['likemean'])
plt.xlabel('company twt length mean')
plt.ylabel('company likes mean')
plt.show()

In [None]:
## number of tweets per company (non-null likes), added to corr2 as 'count'
companytwt = corrdf.groupby(['company'])['likes'].count()
corr2['count'] = companytwt.to_numpy()
corr2

In [None]:
corr2.corr() ## author's reading: the relation across companies is weak

In [None]:
# mean likes per company against that company's tweet count
fig, ax = plt.subplots()
ax.scatter(corr2['likemean'], corr2['count'])
ax.set_ylabel('company twt total')
ax.set_xlabel('company likes mean')
plt.show()

In [None]:
##checking for intra company data to see if it changes through time

def _monthly_company_sums(company, df, columns):
    """Return the month-by-month sums of `columns` for the rows of one company."""
    nd = df.loc[df['inferred company']==company,:]
    # assign() builds a new frame instead of writing into a slice of `df`.
    nd = nd.assign(date=pd.to_datetime(nd['date'])).set_index('date')
    # A MonthEnd offset selects the same calendar-month bins as the "M" alias,
    # which pandas has deprecated.
    return nd.groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))[columns].sum()


def _plot_two_monthly_series(index, first, first_label, second, second_label):
    """Draw two monthly series on one figure with a legend."""
    plt.figure(figsize = (10,6))
    # plot() with circle markers draws what the deprecated plot_date() drew.
    plt.plot(index, first, marker='o', linestyle='solid', label=first_label)
    plt.plot(index, second, marker='o', linestyle='solid', label=second_label)
    plt.legend()
    plt.show()


def likesandlengths(company,df):
    """Plot monthly total likes (divided by 10) and monthly total tweet length.

    Parameters
    ----------
    company : str
        Value matched against the 'inferred company' column.
    df : pandas.DataFrame
        Must contain 'inferred company', 'date', 'likes' and 'tweetlength'
        columns; it is not modified.
    """
    monthly = _monthly_company_sums(company, df, ['likes', 'tweetlength'])
    # The legend now states the scaling applied to the likes curve.
    _plot_two_monthly_series(monthly.index,
                             monthly['likes']/10, 'likes / 10',
                             monthly['tweetlength'], 'length of tweet')


def likesandcount(company,df):
    """Plot monthly total likes (divided by 1000) and monthly tweet count.

    Parameters
    ----------
    company : str
        Value matched against the 'inferred company' column.
    df : pandas.DataFrame
        Must contain 'inferred company', 'date', 'likes' and 'count' columns;
        it is not modified.
    """
    monthly = _monthly_company_sums(company, df, ['likes', 'count'])
    # The legend now states the scaling applied to the likes curve.
    _plot_two_monthly_series(monthly.index,
                             monthly['likes']/1000, 'likes / 1000',
                             monthly['count'], 'count')

In [None]:
# monthly likes and tweet-length totals for Toyota
likesandlengths('toyota',df)

In [None]:
# monthly likes and tweet counts for Toyota
likesandcount('toyota',df)

In [None]:
def correlation(company,df):
    """Correlate one company's monthly tweet length, tweet count and likes.

    The company's rows are summed per calendar month and the three monthly
    series are correlated with DataFrame.corr().

    Parameters
    ----------
    company : str
        Value matched against the 'inferred company' column.
    df : pandas.DataFrame
        Must contain 'inferred company', 'date', 'tweetlength', 'count' and
        'likes' columns. An 'id' column is no longer required, and `df` is not
        modified.

    Returns
    -------
    pandas.DataFrame
        3x3 correlation matrix over 'tweetlength', 'count' and 'likes'.
    """
    nd = df.loc[df['inferred company']==company,:]
    # assign() builds a new frame instead of writing into a slice of `df`.
    # The former in-place drop of 'id' is gone: it did not affect the result
    # and raised KeyError on frames without that column.
    nd = nd.assign(date=pd.to_datetime(nd['date'])).set_index('date')
    # A MonthEnd offset selects the same calendar-month bins as the "M" alias,
    # which pandas has deprecated.
    g = nd.groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))
    h = g[['tweetlength', 'count', 'likes']].sum()

    return h.corr()

In [None]:
# correlation matrix of Toyota's monthly tweet length, tweet count and likes
toyotacorr =correlation('toyota',df)
toyotacorr ## author's reading: data from the same company is highly correlated across time