-
Notifications
You must be signed in to change notification settings - Fork 0
/
NGramLibraryBuilder.java
69 lines (55 loc) · 1.91 KB
/
NGramLibraryBuilder.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
public class NGramLibraryBuilder {
/**
 * Mapper that turns each input line into word n-grams and emits every
 * occurrence with a count of 1.
 *
 * <p>For every starting word, all n-grams of 2 up to {@code noGram} words
 * are emitted (single words are never emitted).
 */
public static class NGramMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** Matches every character that is not a lowercase ASCII letter. */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-z]");

    /** Matches one or more whitespace characters ("\\s" alone would match exactly one). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Constant count written with every n-gram occurrence. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Maximum number of words in an emitted n-gram; assigned in {@link #setup}. */
    int noGram;

    /** Reusable output key, refilled before each write. */
    private final Text gram = new Text();

    /**
     * Reads the maximum n-gram length from the job configuration key
     * {@code "noGram"}, falling back to 5 when the key is absent.
     *
     * <p>NOTE(review): the key is presumably populated by the job driver from a
     * command-line argument; the driver is not part of this file - confirm.
     *
     * @param context task context supplying the job configuration
     */
    @Override
    public void setup(Context context) {
        Configuration conf = context.getConfiguration();
        // The looked-up value must be stored: if it is dropped, noGram stays 0
        // and the n-gram loop in map() never emits anything.
        noGram = conf.getInt("noGram", 5);
    }

    /**
     * Normalizes one line of text and writes every n-gram (2..noGram words)
     * found in it with a count of 1.
     *
     * @param key     input key (unused)
     * @param value   one line of input text
     * @param context sink for the {@code (n-gram, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Locale.ROOT keeps lowercasing locale-independent, so ASCII letters
        // always map into the a-z range kept by NON_LETTERS.
        String line = value.toString().toLowerCase(Locale.ROOT);

        // Replace everything except a-z with a space, THEN trim: trimming first
        // would leave leading spaces for lines that start with a non-letter,
        // which produces an empty first token and bogus one-word "n-grams".
        line = NON_LETTERS.matcher(line).replaceAll(" ").trim();

        // A line without letters yields a single empty token here, so the
        // loop below does not run for it.
        String[] words = WHITESPACE.split(line);

        for (int i = 0; i < words.length - 1; i++) {
            StringBuilder sb = new StringBuilder(words[i]);
            // Grow the phrase one word at a time, emitting after each append,
            // so the 2-gram, 3-gram, ... starting at word i are all written.
            for (int j = 1; i + j < words.length && j < noGram; j++) {
                sb.append(' ').append(words[i + j]);
                gram.set(sb.toString());
                context.write(gram, ONE);
            }
        }
    }
}
/**
 * Reducer that adds up the counts received for one n-gram and writes the
 * n-gram together with its total.
 */
public static class NGramReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Totals all counts for {@code key} and writes {@code (key, total)}.
     *
     * @param key     the n-gram being aggregated
     * @param values  the counts received for this n-gram
     * @param context sink for the aggregated pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }
        IntWritable result = new IntWritable(total);
        context.write(key, result);
    }
}
}