# Making various input data

In [1]:
library(lda)
library(stringr)
library(dplyr)
source('/Users/kosuke/thesis/airport_thesis/code/utils/date.R')
source('/Users/kosuke/thesis/airport_thesis/code/utils/corpus.R')


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



#### Must load (run this cell first; the corpus cells below use its variables)

In [2]:
# Load the review table and the pre-processed review text, keep reviews dated
# 2012-01-01 or later, sort them oldest first and split them 90/10 into
# train and test sets.
data          <- read.csv("/Users/kosuke/thesis/airport_thesis/data/mid/corpus/airport_review_complete.csv", stringsAsFactors = FALSE)
review        <- read.csv("/Users/kosuke/thesis/airport_thesis/data/mid/corpus/morphed_text.csv", stringsAsFactors = FALSE)

# The text is attached by row position, so both files must line up row for row;
# without this check a shorter `review` whose length divides nrow(data) would be
# recycled silently.
stopifnot(nrow(review) == nrow(data))

# get_monthly is defined in the sourced date utilities.
# NOTE(review): presumably it maps a datetime to a month-level date string that
# as.Date() can parse -- confirm against utils/date.R.
data$yymm     <- as.Date(apply(as.data.frame(data$datetime), 1, get_monthly))
data$review   <- review$text

target_data      <- data[data$yymm >= '2012-01-01',]
data_early_order <- target_data[order(target_data$yymm), ]

D     <- nrow(data_early_order)
bound <- round(0.9 * D) # rows up to 90% are training data, the rest is test data

# seq_len() and a logical mask replace 1:bound and (bound + 1):D, which produce
# reversed / out-of-range indices when bound is 0 or equal to D.
train_data    <- data_early_order[seq_len(bound), ]
test_data     <- data_early_order[seq_len(D) > bound, ]
reversed_data <- rbind(train_data, test_data)



## Split Train-Test

#### making train-test corpus

In [7]:
# Vocabulary taken from every review in the split (train and test together).
# NOTE(review): this lets test-set words into the vocabulary; another cell in
# this notebook uses train_data only -- confirm which one is intended.
vocab <- feature_select(reversed_data$review, 0)

# Build the tagged corpus for all reviews, then cut it at `bound`.
# NOTE(review): column 2 is presumably the position of the review inside
# reversed_data, which is why it is compared with `bound` -- confirm against
# utils/corpus.R.
corpus <- create_tags_corpus(
  text = reversed_data$review,
  vocab = vocab,
  y = reversed_data$Recommended
)
train_corpus <- corpus[bound >= corpus[, 2], ]
test_corpus  <- corpus[bound < corpus[, 2], ]

In [5]:
# Build the vocabulary from all reviews (train + test), create the tagged
# corpus, split it at `bound` and write vocabulary and both splits to CSV.
# NOTE(review): another cell in this notebook builds the vocabulary from
# train_data only and writes to the same three files, so whichever cell ran
# last decides what train_vocab.csv / train_corpus.csv / test_corpus.csv
# contain. Using reversed_data here also lets test-set words into the
# vocabulary -- confirm this is intended.
vocab <- feature_select(reversed_data$review, 0)

corpus       <- create_tags_corpus(text=reversed_data$review, vocab=vocab, y=reversed_data$Recommended)
# NOTE(review): column 2 is presumably the review's position in reversed_data,
# hence the comparison with `bound` -- confirm against utils/corpus.R.
train_corpus <- corpus[corpus[,2] <= bound,]
test_corpus  <- corpus[corpus[,2] > bound, ]

# row.names = FALSE (not the reassignable shorthand F) keeps the row-name
# column out of the files.
write.csv(data.frame(vocab),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/train_vocab.csv", row.names = FALSE)
write.csv(as.data.frame(train_corpus),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/train_corpus.csv", row.names = FALSE)
write.csv(as.data.frame(test_corpus),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/test_corpus.csv", row.names = FALSE)

In [4]:
# Build the vocabulary from the training reviews only, create the tagged corpus
# for all reviews with that vocabulary, split it at `bound` and write the
# vocabulary and both splits to CSV.
# NOTE(review): another cell in this notebook builds the vocabulary from
# reversed_data (train + test) and writes to the same three files, so whichever
# cell ran last decides their contents.
vocab <- feature_select(train_data$review, 0)

corpus       <- create_tags_corpus(text=reversed_data$review, vocab=vocab, y=reversed_data$Recommended)
# NOTE(review): column 2 is presumably the review's position in reversed_data,
# hence the comparison with `bound` -- confirm against utils/corpus.R.
train_corpus <- corpus[corpus[,2] <= bound,]
test_corpus  <- corpus[corpus[,2] > bound, ]

# row.names = FALSE (not the reassignable shorthand F) keeps the row-name
# column out of the files.
write.csv(data.frame(vocab),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/train_vocab.csv", row.names = FALSE)
write.csv(as.data.frame(train_corpus),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/train_corpus.csv", row.names = FALSE)
write.csv(as.data.frame(test_corpus),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/test_corpus.csv", row.names = FALSE)

## Feature tuning

In [14]:
# Same pipeline as the train-only vocabulary cell, but with a minimum-count
# setting passed to feature_select; outputs go to the *_10.csv files.
# NOTE(review): the "_10" suffix in the file names is hard-coded, so it must be
# edited by hand whenever min_count is changed.
min_count <- 10 # set to any value you like

vocab  <- feature_select(train_data$review, min_count)
corpus <- create_tags_corpus(text=reversed_data$review, vocab=vocab, y=reversed_data$Recommended)
# NOTE(review): column 2 is presumably the review's position in reversed_data,
# hence the comparison with `bound` -- confirm against utils/corpus.R.
train_corpus <- corpus[corpus[,2] <= bound,]
test_corpus  <- corpus[corpus[,2] > bound, ]

# row.names = FALSE (not the reassignable shorthand F) keeps the row-name
# column out of the files.
write.csv(data.frame(vocab), "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/train_vocab_10.csv", row.names = FALSE)
write.csv(as.data.frame(train_corpus),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/train_corpus_10.csv", row.names = FALSE)
write.csv(as.data.frame(test_corpus),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/test_corpus_10.csv", row.names = FALSE)

## MPQA lexicon
http://mpqa.cs.pitt.edu/lexicons/subj_lexicon/

In [1]:
# Read the lexicon file with read.table's default settings.
lexicon <- read.table(file = "/Users/kosuke/cs_stat_ml/lexicon/lexicon.tff")

In [2]:
# Show how many entries (rows) the lexicon has.
dim(lexicon)[1L]

In [72]:
#' Strip the "word1=" field label from lexicon tokens.
#'
#' @param x Character vector of tokens, e.g. "word1=abandon".
#' @return `x` with the first occurrence of "word1=" removed from each element.
parser_word <- function(x) {
  str_replace(string = x, pattern = "word1=", replacement = "")
}

#' Strip the "priorpolarity=" field label from lexicon tokens.
#'
#' @param x Character vector of tokens, e.g. "priorpolarity=negative".
#' @return `x` with the first occurrence of "priorpolarity=" removed from each
#'   element.
parser_polarity <- function(x) {
  str_replace(string = x, pattern = "priorpolarity=", replacement = "")
}

In [76]:
# Pull columns 3 and 6 of the lexicon into one-column data frames.
# NOTE(review): presumably column 3 holds the "word1=..." field and column 6 the
# "priorpolarity=..." field, since they are later passed to parser_word and
# parser_polarity -- confirm against the lexicon file layout.
lexicon_word <- data.frame(lexicon[, 3])
lexicon_polarity <- data.frame(lexicon[, 6])

In [96]:
# Remove the field labels from every lexicon entry.
# The parsers are vectorised (str_replace works element-wise), so they are
# called once on the whole column instead of row by row through apply(), which
# would first coerce the data frame to a character matrix.
word <- parser_word(as.character(lexicon_word[[1]]))
polarity <- parser_polarity(as.character(lexicon_polarity[[1]]))

In [148]:
# Word/polarity dictionary restricted to entries labelled "positive" or
# "negative"; any other polarity label is dropped.
# stringsAsFactors = FALSE keeps both columns as character regardless of the
# R version's default, so the later join on `word` does not depend on factor
# levels.
polarity_dic <- data.frame(word = word, polarity = polarity, stringsAsFactors = FALSE)
polarity_dic <- polarity_dic[polarity_dic$polarity %in% c("positive", "negative"), ]

In [144]:
# Pair every vocabulary entry with its 1-based row position.
# seq_len() instead of seq(1, n): for an empty vocabulary seq(1, 0) would yield
# c(1, 0) rather than an empty index.
# NOTE(review): assumes `vocab` is a single-column table (it must have rows for
# nrow() to work and exactly two names are assigned below) -- confirm.
vocab_index <- cbind(vocab, seq_len(nrow(vocab)))
colnames(vocab_index) <- c("word", "word_index")

In [149]:
# Keep only the dictionary entries whose word also appears in the vocabulary,
# carrying the vocabulary's word_index along.
target_pol_dic <- inner_join(x = polarity_dic, y = vocab_index, by = "word")

“Column `word` joining factors with different levels, coercing to character vector”

In [152]:
# Write the matched polarity entries, with duplicate rows removed, to CSV.
# row.names = FALSE (not the reassignable shorthand F) keeps the row-name
# column out of the file.
write.csv(unique(target_pol_dic),
          "/Users/kosuke/thesis/airport_thesis/data/mid/corpus/lexicon.csv",
          row.names = FALSE)