In [None]:
install.packages("syuzhet")

In [None]:
install.packages("caret")

In [None]:
## Importing packages
library(tidyverse) # metapackage with lots of helpful functions
library(tidytext) # tidy implimentation of NLP methods
library(syuzhet)
library(caret)

In [None]:
# Load the fake-news dataset into `news`.
news <- read_csv(file = "/content/fake.csv")

In [None]:
# Collapse the fine-grained `type` labels into two classes.
# Whole values are matched here; gsub() rewrote substrings, so any label that
# merely contained one of these words would have been mangled.
# "bs" and "conspiracy" are also treated as fake ...
news$type[news$type %in% c("bs", "conspiracy")] <- "fake"
# ... while the other listed categories are treated as real.
news$type[
  news$type %in% c("bias", "satire", "hate", "junksci", "state")
] <- "real"

In [None]:
# Number of articles in each class (fake vs. real).
news %>%
  group_by(type) %>%
  summarise(count = n())

In [None]:
# Count exclamation and question marks per article and store the counts as new
# columns of `news`. A run of repeated marks ("!!!") counts once, as the
# original patterns did.
# str_count() counts the matches themselves; counting the pieces returned by
# strsplit() reported 1 for a text without any mark and miscounted texts that
# end in a mark.
# Missing texts are counted as 0 so the per-type sums stay defined.
news$exc <- coalesce(str_count(as.character(news$text), "!+"), 0L)
news$que <- coalesce(str_count(as.character(news$text), "\\?+"), 0L)

In [None]:
# Total of the `exc` column for each news type.
news %>%
  group_by(type) %>%
  summarise(exclamations = sum(exc))

In [None]:
# Total of the `que` column for each news type.
news %>%
  group_by(type) %>%
  summarise(QuestionMarks = sum(que))

In [None]:
# Box plot of the per-article `exc` values by news type (y-axis limited to
# 0-20).
boxplot(
  exc ~ type,
  data = news,
  ylim = c(0, 20),
  ylab = "",
  col = c("red", "orange")
)
# Author's observation: fake news have more exclamations than real news.

In [None]:
  # Box plot of the per-article `que` values by news type (y-axis limited to
  # 0-20).
  boxplot(
    que ~ type,
    data = news,
    ylim = c(0, 20),
    col = c("red", "orange")
  )
  # Author's observation: fake news have more question marks than real news.

In [None]:
# Count how often each word occurs within each group of a text data frame.
#
# Arguments:
#   fake         - data frame holding the text and the grouping column.
#   text_column  - unquoted name of the column containing the raw text.
#   group_column - unquoted name of the column to count words within.
#
# Returns an ungrouped data frame with one row per (group, word) pair and the
# number of occurrences in column `n`.
terms <- function(fake, text_column, group_column) {
  group_column <- enquo(group_column)
  text_column <- enquo(text_column)

  # Tokenise the data frame that was passed in. Previously the global `news`
  # was read here and the `fake` argument was silently ignored.
  fake %>%
    unnest_tokens(word, !!text_column) %>%
    count(!!group_column, word) %>%
    ungroup()
}

In [None]:
# Build the per-type word-count table from the news articles.
df <- terms(fake = news, text_column = text, group_column = type)

In [None]:
# Box plot of the word counts (`n`) for each news type, on a log-scaled y-axis.
boxplot(
  n ~ type,
  data = df,
  log = "y",
  xlab = "type",
  ylab = "number of words",
  col = c("green", "pink")
)

In [None]:
# Score every article text with get_nrc_sentiment() and display the result.
article_text <- news$text
sentiment <- get_nrc_sentiment(article_text)
rm(article_text)
sentiment

In [None]:
# Keep only the negative and positive score columns for the analysis.
# They are selected by name rather than by position (9, 10) so the right
# columns are picked regardless of column order; the same names are used when
# the columns are normalised later on.
df1 <- sentiment[c("negative", "positive")]

In [None]:
# Min-max normalisation: rescale a numeric vector to the range [0, 1].
#
# Arguments:
#   x - numeric vector.
#
# Returns a vector of the same length as `x`. Missing values are ignored when
# the range is computed and stay NA in the result. Empty or all-NA input is
# returned unchanged, and a vector whose non-missing values are all equal maps
# to 0 instead of NaN (0 / 0).
normalize <- function(x) {
  if (length(x) == 0L || all(is.na(x))) {
    return(x)
  }

  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2] - bounds[1]

  # A zero span would divide by zero; every value equals the minimum here.
  if (span == 0) {
    return(x - bounds[1])
  }

  (x - bounds[1]) / span
}

In [None]:
# Rescale both score columns with normalize() so they are comparable.
df1[c("negative", "positive")] <- lapply(
  df1[c("negative", "positive")],
  normalize
)

In [None]:
# Append the normalised sentiment columns to the news data, column-wise.
news <- cbind(news, df1)

In [None]:
# Per-type spread (standard deviation) and centre (median) of the negative and
# positive sentiment scores; one summary table per statistic.
neg_sd <- news %>%
  group_by(type) %>%
  summarise(neg_sd = sd(negative))

pos_sd <- news %>%
  group_by(type) %>%
  summarise(pos_sd = sd(positive))

neg_med <- news %>%
  group_by(type) %>%
  summarise(neg_med = median(negative))

pos_med <- news %>%
  group_by(type) %>%
  summarise(pos_med = median(positive))

In [None]:
# Convert each summary table to a plain data frame:
# dfr1 / dfr2 hold the positive / negative standard deviations,
# dfr3 / dfr4 hold the negative / positive medians.
dfr1 <- data.frame(pos_sd)
dfr2 <- data.frame(neg_sd)
dfr3 <- data.frame(neg_med)
dfr4 <- data.frame(pos_med)

In [None]:
# Join the two standard-deviation tables on their common column(s), then
# transpose the result and display it.
t1 <- merge(x = dfr1, y = dfr2)
t2 <- t(t1)
t2

In [None]:
install.packages("caret")

In [None]:
install.packages("tm")


In [None]:
install.packages("SnowballC")

In [None]:
install.packages("e1071")

In [None]:
install.packages("caTools")

In [None]:
library(tm)         # For text processing and document-term matrix
library(SnowballC)  # For text stemming
library(caTools)    # For data splitting
library(caret)      # For model evaluation
library(tidyverse)  # For data manipulation

In [None]:
# Load the labelled fake/real news dataset; keep text columns as character.
news_data <- read.csv(
  file = "/content/fake_or_real_news.csv",
  stringsAsFactors = FALSE
)


In [None]:
# Build a corpus with one document per article text.
corpus <- VCorpus(VectorSource(news_data$text))

# Clean the text.
# Stop words are removed before punctuation is stripped: the stop-word list
# contains contractions such as "don't", which no longer match once the
# apostrophe has been removed. Whitespace is collapsed last so the gaps left
# behind by removed words and characters are cleaned up as well.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

In [None]:
# Turn the cleaned corpus into a document-term matrix.
dtm <- DocumentTermMatrix(x = corpus)

In [None]:
# Encode the label as a factor: "FAKE" becomes 0, every other value becomes 1.
news_data$label <- factor(ifelse(news_data$label == "FAKE", 0, 1))

# Split the rows into train and test sets (SplitRatio = 0.75); the seed is
# fixed so the split is reproducible.
set.seed(123)
split <- sample.split(news_data$label, SplitRatio = 0.75)
train_indices <- which(split)
test_indices <- which(!split)


In [None]:
# Rows of the document-term matrix that belong to each side of the split.
train_dtm <- dtm[train_indices, ]
test_dtm <- dtm[test_indices, ]

# The labels are split with the same indices so they stay aligned with the
# matrix rows.
train_labels <- news_data$label[train_indices]
test_labels <- news_data$label[test_indices]

In [None]:
library(e1071)
# Fit a linear-kernel SVM classifier on the training documents.
# NOTE(review): as.matrix() converts the document-term matrix to a dense
# matrix; this may need a lot of memory for a large vocabulary - confirm.
model <- svm(
  x = as.matrix(train_dtm),
  y = train_labels,
  type = "C-classification",
  kernel = "linear"
)